Initial Project Flow Diagram

  • Diagram below outlines the dataset generation process shared with stakeholders and was used as a guide for this project

Initial Project Scope

Overview

  • Actual project work structure following flow diagram

Dataset Builder Flow

Step 1

Outline of Process

NOTE: This markdown will map the process to create multiple dataset timepoints

  • Retrieve/receive PIDNs and compile PIDN list

    • If PIDNs and dates are provided, extract PIDNs from list
  • Pull LAVA data required for timepoint function:

    • infoprocessingspeed

    • agingcogmeasures

    • neuropsychbedside

    • neuropsychcvlt

    • neuropsychmmse

    • udsneuropsychmoca

    • udsneuropsych

    • udscdr

  • Run timepoint function

Step 1 Overview

Import LAVA Data

  • These are needed to generate timepoints for the PIDNs
#read in the lava files downloaded to the R Drive
#returns full paths of the CSV/Excel exports in the lava_pull folder
lava_files <- list.files(str_c(datafolder, 'LavaQuery_Outputs/lava_pull'),
                       recursive = FALSE, all.files = FALSE, full.names = TRUE)%>%
  #drop Office lock/temp files (names containing ~ or $) and Word docs
  str_subset('[~$]|\\.docx', negate = TRUE)%>%
  #FIX: escape the dot and anchor to the end of the filename; the original
  #pattern '.csv|.xlsx' treated '.' as "any character" and matched anywhere in
  #the name (e.g. 'old_xcsv_notes.txt' would slip through)
  str_subset('\\.(csv|xlsx)$')


#save all the lava sheets into a list of dataframes
lava <- read_all_files(lava_files, clean_names = TRUE)%>%
  #set names to the sheet name without the suffix and '_mac_'
  set_names(~str_remove_all(.x,c('_mac_|_x.*')))%>%
  #convert dc_date to date datatype and rename it to DCDate if there is a dc_date column 
  map(function(x){
    if(any(names(x) == 'dc_date')){
      mutate(x,dc_date = as_date(dc_date))%>%
        rename(DCDate=dc_date)}
    else(x)})

Clean Duplicates

  • Mapping the clean_instrument function in hillblom_functions.R to the lava dataframes used for timepoint function
  • Use cleaned data for step one and reuse cleaned data for step two

Step 1 Instrument Flow

# Instruments whose visits drive timepoint generation in step 1. Duplicate
# entries are resolved by clean_instrument(), which prioritizes the entry with
# the fewest NAs and the preferred visit types; negative sentinel values are
# replaced as well.
tp_instruments <- c(
  "infoprocessingspeed", "agingcogmeasures", "neuropsychbedside",
  "neuropsychcvlt", "neuropsychmmse", "udsneuropsychmoca",
  "udsneuropsych", "udscdr"
)

lava <- modify_at(lava, tp_instruments, clean_instrument, replace_neg_values = TRUE)

Gather PIDNs and DCDates

NOTE: If using data from the file used to extract PIDNs/PIDNs and Dates, store that data in environment

  • Store PIDNs/PIDNs and Dates in list

    • If only using dates provided to you, store PIDNs and Dates to original_data_no_tps
  • Rename dates to DCDate and convert to date data type

  • Create list of dfs

    • original_data_tps: Timepoints being generated here

    • original_data_no_tps: Only using pre-existing timepoints

#run this function to select file interactively 
#some_file<-read_file()

# BrANCH dataset: PIDN list only; timepoints are generated downstream
original_data_BrANCH <- read_csv(str_c(datafolder, 'step_one/BrANCH_dataset_pidns.csv'))

# specimens: the sample collection date serves as the visit date (DCDate)
original_data_specimens <- read_excel(str_c(datafolder, 'step_one/specimens_dataset_pidns_data.xlsx'))%>%
   mutate(DCDate = as_date(SampleDate))%>%
   select(PIDN, DCDate, SampleID)

# sleep: sleep-study date is the pre-existing timepoint (no generation needed)
original_data_sleep <- read_excel(str_c(datafolder, 'step_one/sleep_dataset_pidns_data.xlsx'))%>%
  clean_names()%>%
  rename(PIDN = pidn, DCDate = sleep_study_date)%>%
  mutate(DCDate = as_date(DCDate))

# import fitbit data and store in environment
# NOTE(review): PIDN <= 50 appear to be test/dummy IDs -- TODO confirm
fitbit <- read_csv(str_c(datafolder, 'Export-3-2-2023/final_fitbit_Export-3-2-2023.csv'))%>%
  filter(PIDN > 50)%>%
  ungroup()

# filter out ActAN data and any participant with project labeled excluded from Hillblom
# FIX: the original condition !(ProjectType == 'ActAN' & ProjectType != "EXCLUDED from Hillblom")
# had a vacuous second clause (a value equal to 'ActAN' can never also equal
# "EXCLUDED from Hillblom"), so EXCLUDED rows were silently kept. Per the
# stated intent, drop both ActAN and EXCLUDED rows.
original_data_fitbit <- fitbit%>%
  filter(ProjectType != 'ActAN', ProjectType != "EXCLUDED from Hillblom")%>%
  group_by(PIDN)%>%
  # sequential visit number within participant
  # (row_number() gives the same result as the original zoo::index(PIDN) and
  # drops the zoo dependency)
  mutate(Timepoint = row_number())

# FIX: the original `if (cond) A else B -> original_data_tps` parses as
# `if (cond) A else (B -> original_data_tps)`, so when `some_file` existed the
# result was never assigned. Assign explicitly instead.
original_data_tps <- if (exists('some_file')) {
  lst(original_data_BrANCH, original_data_specimens, some_file)
} else {
  lst(original_data_BrANCH, original_data_specimens)
}

# datasets that arrive with their own dates; no timepoint generation
original_data_no_tps <- lst(original_data_sleep, original_data_fitbit)

Compile Instrument Hierarchy

  • Mapping through the dataframes in original_data_tps
  • Creating hierarchy of instruments used to create timepoints
  • Creating variable indicating whether there are other data collection within six months
  • Creating grouping of instruments used to identify priority and sparse timepoints
  • Creating min_TPhierarchy to identify which highest value instrument at visit (lower is better)
# For each source dataset (keyed by PIDN), attach every LAVA visit that has
# scoreable data from one of the eight timepoint instruments, tagged with a
# hierarchy rank (1 = highest priority, 8 = lowest; lower is better).
# NOTE(review): ifelse(score >= 0, rank, NA) yields NA when the score itself is
# NA, so visits whose key scores are all missing are removed by the filter
# below -- presumably intended.
timepoints_dfs <- map(original_data_tps, function(df){
  df%>%
  select(PIDN)%>%
  # stack the eight instrument tables, rank each row, then keep one row per
  # (PIDN, DCDate, rank) and join all of a participant's ranked visits onto
  # every PIDN in the source dataset
  left_join(bind_rows(pluck(lava,'infoprocessingspeed')%>%
                            mutate(TPhierarchy = ifelse(anim_yes_med > 0, 1, NA)),
                        pluck(lava,'agingcogmeasures')%>%
                            mutate(TPhierarchy = ifelse((dot_total >= 0)|
                                                        (df_emp_c >= 0)|
                                                        (df_swch_c >= 0), 2, NA)),
                        pluck(lava,'neuropsychbedside')%>%
                            mutate(TPhierarchy = ifelse((mt_time >= 0) |
                                                        (mod_rey >= 0) |
                                                        (d_corr >= 0) |
                                                        (bnt_tot >= 0), 3, NA)),
                        pluck(lava,'neuropsychcvlt')%>%
                            mutate(TPhierarchy = ifelse((corr10 >= 0) |
                                                        (cv2lfrc >= 0), 4, NA)),
                        pluck(lava,'neuropsychmmse')%>%
                            mutate(TPhierarchy = ifelse(mmse_tot >= 0, 5, NA)),
                        pluck(lava,'udsneuropsychmoca')%>%
                            mutate(TPhierarchy = ifelse((mocatots >= 0) | 
                                                        (craftvrs >= 0) | 
                                                        (crafturs >= 0) |
                                                        (traila >= 0) | 
                                                        (minttots >= 0), 6, NA)),
                        pluck(lava,'udsneuropsych')%>%
                            mutate(TPhierarchy = ifelse((logimem >= 0) | 
                                                        (traila >= 0), 7, NA)),
                        pluck(lava,'udscdr')%>%
                            mutate(TPhierarchy = ifelse(cdrglob >= 0, 8, NA)))%>%
                  # one row per visit per rank; drop visits with no usable data
                  distinct(PIDN, DCDate, TPhierarchy)%>%
                  filter(!is.na(TPhierarchy)), by = 'PIDN')%>%
    arrange(PIDN, DCDate, TPhierarchy)})



# Collapse instrument-level rows to one row per (PIDN, DCDate) visit, keeping a
# list-column of the visit's hierarchy ranks, the best (minimum) rank, an
# instrument-category label, and a flag for nearby data collection.
timepoints_dfs <- map(timepoints_dfs, function(df) {
  df%>%
    group_by(PIDN)%>%
    #create logical if data within 6 month period of a timepoint to prevent sparse tps
    #NOTE(review): rows are still instrument-level and date-sorted here, so two
    #instruments collected on the same date are lag/lead neighbors 0 days apart
    #and will flag each other as "within six months" -- confirm intended
    mutate(other_instruments_within_six_months = ifelse(
      difftime(DCDate, lag(DCDate), units = "days") <= 183 & !is.na(lag(DCDate)) |
      difftime(lead(DCDate),DCDate, units = "days") <= 183 & !is.na(lead(DCDate)), 1,0))%>%
    #get list of instruments for each datapoint
    group_by(PIDN, DCDate) %>%
    #cat_A/B/C are TRUE only when EVERY instrument at the visit falls in that
    #tier (A = ranks 1-2, B = 3-5, C = 6-8); a visit mixing tiers is labeled
    #"multiple". The outer all() wrappers in case_when are redundant since the
    #cat_* values are constant within the group, but harmless.
    mutate(instruments = lst(TPhierarchy), 
           cat_A = all(TPhierarchy == 1 | TPhierarchy == 2), 
           cat_B = all(TPhierarchy == 3 | TPhierarchy == 4 | TPhierarchy == 5), 
           cat_C = all(TPhierarchy == 6 | TPhierarchy == 7 | TPhierarchy == 8),
           instrument_category = case_when(all(cat_A & !cat_B & !cat_C) ~ "a", 
                                           all(!cat_A & cat_B & !cat_C) ~ "b", 
                                           all(!cat_A & !cat_B & cat_C) ~ "c", 
                                           all(!cat_A & !cat_B & !cat_C) ~ "multiple"), 
           #lower rank = higher-priority instrument at this visit
           min_TPhierarchy = min(TPhierarchy))%>%
    distinct(PIDN, DCDate, instruments, min_TPhierarchy,
             other_instruments_within_six_months, instrument_category, cat_A, cat_B, cat_C)})

Nest Dataframes

  • Nesting a dataframe inside column of current dataframe containing data from previous visit and data within 180 days of previous visit
  • Nesting a dataframe inside column of current dataframe containing data that is within 180 days of current visit (backward or forward)
  • Nesting a dataframe inside column of current dataframe containing data that is within 180 to 365 days after current visit
  • Nesting a dataframe inside column of current dataframe containing data from next visit and visits within 180 days of next visit
# For every timepoint row, nest dataframes of neighboring timepoints in four
# date windows, then left-join the four nested columns back onto the original
# rows by (PIDN, DCDate). Runs per-dataset in parallel via furrr.
# NOTE(review): find_data_within_range() is a project helper; column_name is
# passed unquoted, so it presumably uses tidy evaluation -- confirm against
# hillblom_functions.R.
timepoints_dfs <- furrr::future_map(timepoints_dfs, 
  function(tp_df) {
    purrr::reduce(list(tp_df,
                   # previous visit plus any visit within 180 days before it
                   find_data_within_range(df = tp_df,
                                          max_num_days = 180,
                                          column_name = timepoints_within_lag_or_lag_range,
                                          direction = 'backward',
                                          include_self = TRUE,
                                          inclusive = TRUE,
                                          lag_or_lead = 'lag'),
                   # any visit within 180 days of current, either direction
                   find_data_within_range(df = tp_df,
                                          max_num_days = 180,
                                          column_name = timepoints_within_six_months,
                                          direction = 'both',
                                          include_self = FALSE,
                                          inclusive = TRUE),
                   # any visit 180-365 days after current
                   find_data_within_range(df = tp_df,
                                          max_num_days = 365,
                                          min_num_days = 180,
                                          column_name = timepoints_within_six_months_to_year,
                                          direction = 'forward',
                                          include_self = FALSE,
                                          inclusive = TRUE),
                   # next visit plus any visit within 180 days after it
                   find_data_within_range(df = tp_df,
                                          max_num_days = 180,
                                          column_name = timepoints_within_lead_or_lead_range, 
                                          direction = 'forward', 
                                          lag_or_lead = 'lead', 
                                          inclusive = TRUE, 
                                          include_self = TRUE)),
              dplyr::left_join, by = c('PIDN', 'DCDate'))})

Create Logicals

  • Create logical for if any instruments in current are found within the previous visit or any visit within six months of the previous visit
  • Create logical for if the current instrument-based groups (a, b, c) are found within six months of the next visit
  • Create logical for if any instruments in current are found within six months of the next visit
# For each dataset, derive per-timepoint logical flags by comparing the current
# row's instruments / category / hierarchy against the neighbor dataframes
# nested in the previous step.
# NOTE(review): `pluck(col)` inside mutate with a bare column name simply
# returns that column; map2_lgl/map_lgl then walk the list-column of nested
# neighbor dataframes row by row. `.default = lst()` makes flatten_dbl return
# an empty vector (so `any(... %in% ...)` is FALSE) when a row has no neighbors.
furrr::future_map(
  timepoints_dfs,
  function(df) {
    df%>%
      #detects current instruments in previous visit or any visit within 6 months from previous visit
      mutate(any_instr_within_lag_or_six_months_from_lag = 
               #compare current instruments to data nested in 'timepoints_within_lag_or_lag_range'
               map2_lgl(instruments,
                        pluck(timepoints_within_lag_or_lag_range),
                        #ANY instrument at current tp found in the 'instruments' inside nested df
                        function(current_instruments, instruments_within_range)
                          any(current_instruments %in% 
                                flatten_dbl(pluck(instruments_within_range,
                                                  'instruments', .default = lst())))),
             
             #detects whether current tp category is found in next visit or six months from next visit
             group_within_six_months_from_lead =
               #selects current timepoint instrument category
               map2_lgl(instrument_category,
                        #gathers dataframe nested in column
                        #'timepoints_within_lead_or_lead_range'
                        pluck(timepoints_within_lead_or_lead_range),
                        #if current timepoint has data from single instrument category, detect
                        #whether that instrument category collected at next visit or within six
                        #months from next visit ('multiple' is excluded from the comparison)
                        function(current_category, dates_within_range)
                          any(str_subset(current_category, 'multiple', negate = TRUE) %in%
                                pluck(dates_within_range, 'instrument_category'))),
             
             # detects whether any instruments at current timepoint are found within next visit
             # or any visit within six months from next visit
             any_instr_within_lead_or_six_months_from_lead = 
               #compare current instruments
               map2_lgl(instruments,
                        #gather dataframe nested in column
                        #'timepoints_within_lead_or_lead_range'
                        pluck(timepoints_within_lead_or_lead_range),
                        #compare current instruments to all 'instruments' within the next
                        #visit and visits within 180 days from next visit
                        function(current_instruments, dates_within_range) 
                          any(current_instruments %in% 
                                flatten_dbl(pluck(dates_within_range,
                                                  'instruments', .default = lst())))),
             
             #detects whether any instrument at current tp are present in any visit 6mo to year away
             any_instr_within_next_six_months_to_year = 
               #select current instruments to compare
               map2_lgl(instruments, 
                        #gather dataframe nested in column
                        #'timepoints_within_six_months_to_year'
                        pluck(timepoints_within_six_months_to_year),
                        #detect whether any of the current instruments are found within
                        #timepoints 180-365 days from current 
                        function(current_instruments, dates_within_range) 
                          any(current_instruments %in% flatten_dbl(pluck(dates_within_range,
                                                                         'instruments',
                                                                         .default = lst())))),
             
             #detects whether any instruments at current timepoint are found within any visit
             #within six months 
             any_instr_within_six_months = 
               #select current instrument 
               map2_lgl(instruments,
                        #gather df nested in column 'timepoints_within_six_months'
                        pluck(timepoints_within_six_months),
                        #detect whether any current instrument is found within 180 days in
                        #either direction from current visit
                        function(current_instruments, dates_within_range) 
                          any(current_instruments %in% flatten_dbl(pluck(dates_within_range,
                                                                         'instruments',
                                                                         .default = lst())))),
             
             #detects whether any timepoint within 180 days in either direction contains data
             #from more than one category
             any_timepoints_with_data_from_more_than_one_cat_within_six_months = 
               #gathers dataframe nested in column 'timepoints_within_six_months' and detects
               #whether any of those visits collected data from more than one category
               map_lgl(pluck(timepoints_within_six_months),
                       function(x) any(str_detect(pluck(x,'instrument_category'), 'multiple'))),
             
             #detects whether the current visit has the highest priority compared to all visits
             #within 180 days in either direction 
             highest_priority_within_six_months = 
               #compares current minimum timepoint hierarchy
               map2_lgl(min_TPhierarchy, 
                        #gathers dataframe nested in column 'timepoints_within_six_months' 
                        pluck(timepoints_within_six_months),
                        #compares current timepoint hierarchy to all visit timepoint hierarchy
                        #within 180 days in either direction (lower rank wins ties)
                        function(current_min_tph, tps_within_range) 
                          all(current_min_tph <= (pluck(tps_within_range, 
                                                        'min_TPhierarchy')))))}) -> 
  timepoints_dfs

Generate Timepoints

  • create lag/lead timepoint variables
  • create logical for if next visit is more than 6 months but less than year apart
  • validating timepoints
    • valid
      • If more than 365 days from other timepoints
      • If current timepoint contains data from only one instrument category
        • and next/previous visit more than 365 days away
        • and no visits within 6 months from current have instruments from more than one category
        • compare other visits (only containing instruments from single category) within six months from current to see whether current timepoint has highest priority. If so, timepoint is valid.
      • If current timepoint contains data from only one instrument category
        • and two visits away in either direction are greater than 365 days away
        • and there are no instruments within six months from current that contain instruments from multiple categories
        • compare current timepoint to see whether it has highest priority compared to other timepoints within six months to see if current has priority
        • e.g. timepoint is nested between two timepoints that have instruments from only one category and the next visits after those are more than 365 days away
      • If timepoint has instruments from multiple categories and 180 days or more away from any other timepoint
      • If current timepoint contains data from only one instrument category
        • and any timepoint is 180-365 days in either direction
        • and closest timepoint contains same instrument as current timepoint
        • or timepoint within 180 days from closest timepoint contains same instrument as current timepoint
        • and no timepoints within 6 months of current have data from multiple instrument categories
        • and current timepoint has highest priority over others within six months
      • If current timepoint contains data from only one instrument category
        • and first/last visit on record
        • and instrument was recorded at next/previous visit or within 6 months from previous or next visit
      • If current timepoint contains data from only one instrument category
        • and first/last record on file
        • and if there are other instruments within six months,
        • and no visits containing more than one instrument category within six months
        • and this visit takes priority over those single instrument category visits within 6 months from current
      • If instrument was collected again within 180 days before or after current timepoint
      • If current timepoint collected instruments from more than one group
        • and the previous and next visits only contain instruments from one group
        • and the visit after the next visit is more than 180 days from the current visit (or does not exist)
        • and the visit before the previous visit is more than 180 days from the current visit (or does not exist)
# Validate timepoints: compute lag/lead day gaps per participant, evaluate six
# validity criteria (documented in the bullets above), keep rows meeting at
# least one, and reduce each dataset to distinct (PIDN, DCDate) pairs.
# NOTE(review): comparisons such as `lead_timepoint_diff < 180` are NA at a
# participant's first/last visit, and if_else() then yields NA for that
# criterion; in the final OR-chain an NA criterion counts as not-met unless
# another is TRUE, and filter(valid == 1) drops rows where valid is NA --
# confirm this NA handling is intended.
map(
  timepoints_dfs, 
  function(df) {
    df%>%                      
      group_by(PIDN)%>%
      #group by PIDN to compare participant data to itself 
      mutate(lag_timepoint_diff = as.numeric(difftime(DCDate, lag(DCDate) ,units = 'days')),
             #days from prev visit
             lead_timepoint_diff = as.numeric(difftime(lead(DCDate), DCDate, units = 'days')),
             #days from next visit
             lead_timepoint_within_range = 
               between(as.numeric(difftime(lead(DCDate), DCDate, units = 'days')), 180, 365),
             #if next visit between 180-365 days
             #using to check instruments that only have one category
             lag_timepoint_within_range = 
               between(as.numeric(difftime(DCDate, lag(DCDate), units = 'days')), 180, 365),
             #lag between 180-365---using to check instruments that only have one category
             
             #if current timepoint is more than a year away from other timepoints, meets criteria
             #of being isolated
             isolated_timepoint = if_else(
                 #set to valid if first timepoint and next is more than a year away
                 (is.na(lag_timepoint_diff) & 365 < lead_timepoint_diff) |
                   #valid if more than 365 days from previous visit and last visit on file 
                   (is.na(lead_timepoint_diff) & lag_timepoint_diff > 365) |
                   #valid if only participant timepoint
                   (is.na(lag_timepoint_diff) & is.na(lead_timepoint_diff)) |
                   #valid if no other visits within 365 days from current 
                   (lag_timepoint_diff > 365 & lead_timepoint_diff > 365), 1 ,0),
             
             #compare with priority of data within six months if next (x2) or previous (x2)
             #timepoint more than a year away or next (x2) & previous (x2) don't exist
             six_month_single_category_priority_far_neighbors = if_else(
                 #current tp has instruments from a single category, takes priority within
                 #six months, and its distant neighborhood is sparse
                 instrument_category != 'multiple' & 
                   other_instruments_within_six_months &
                   highest_priority_within_six_months &
                   !any_timepoints_with_data_from_more_than_one_cat_within_six_months &
                   (lag_timepoint_diff > 365 |
                      lead_timepoint_diff > 365 |
                      ((as.integer(difftime(DCDate, lag(DCDate, n = 2), units = 'days')) > 365 |
                          is.na(lag(DCDate, n = 2))) & 
                         (as.integer(difftime(lead(DCDate, n = 2), DCDate, units = 'days')) > 365 |
                            is.na(lead(DCDate, n = 2))))), 1, 0),
             
             #if timepoint contains instruments from more than one instrument category and next
             #or previous visit is 180 days or more from current or doesn't exist
             multi_cat_neighbors_six_months_plus = if_else(
               (lag_timepoint_diff >= 180 | lead_timepoint_diff >= 180 |
                  is.na(lead_timepoint_diff) | is.na(lag_timepoint_diff)) &
                 instrument_category == 'multiple', 1, 0),
             
             #if takes priority over tps within six months and instrument was repeated in lag/lead
             single_cat_priority_repeated_instr_lag_lead_range = if_else(
               #current visit has data from only one category
               (instrument_category != 'multiple' &
                  #no multicategory visits within six months in either direction
                  !any_timepoints_with_data_from_more_than_one_cat_within_six_months &
                  #highest priority over all visits within six months in either direction 
                  highest_priority_within_six_months &
                  #next visit is within 180-365 days and any of current instruments are
                  #found in next visit or within 180 days after next visit
                  ((lead_timepoint_within_range &
                      any_instr_within_lead_or_six_months_from_lead) |
                     #previous visit is within 180-365 days and any of current instruments
                     #are found in previous visit or within 180 days before previous visit
                     (lag_timepoint_within_range &
                        any_instr_within_lag_or_six_months_from_lag) |
                     #other timepoints within 6 months from current timepoint exist and
                     #current timepoint is last or first visit 
                     (other_instruments_within_six_months &
                        (is.na(lead_timepoint_diff) |
                           is.na(lag_timepoint_diff))))) |
                 #single-category visit at the start or end of the record
                 (instrument_category != 'multiple' &
                    #current visit is last visit and any of the current instruments are
                    #found in the previous visit or within 180 days before previous visit 
                    ((is.na(lead_timepoint_diff) &
                        any_instr_within_lag_or_six_months_from_lag) |
                       #current visit is first visit and any of the current instruments
                       #are found in the next visit or within 180 days after next visit
                       (is.na(lag_timepoint_diff) &
                          any_instr_within_lead_or_six_months_from_lead))), 1, 0),
             
             #if timepoint contains an instrument that was collected within six months (forward
             #or backward), criteria is met
             six_month_neighbor_instr_repeated_lag_lead_range = if_else(
               #catch all where instrument was repeated within 6 months from current date
               #backward or forward
               (lead_timepoint_diff < 180 &
                  any_instr_within_lead_or_six_months_from_lead) |
                 (lag_timepoint_diff < 180 &
                    any_instr_within_lag_or_six_months_from_lag), 1, 0),
             
             
             #valid if collected data from more than one category if immediate neighbors are
             #single category and next/previous visit from neighbor is more than 6 months away
             stuck_between_single_cat = if_else(
               instrument_category == 'multiple' &
                 lead(instrument_category) != 'multiple' & 
                 lag(instrument_category) != 'multiple' & 
                 (as.integer(difftime(DCDate, lag(DCDate, n = 2), units = 'days')) > 180 | 
                    is.na(lag(DCDate, n = 2))) &
                 (as.integer(difftime(lead(DCDate, n = 2), DCDate, units = 'days')) > 180 |
                    is.na(lead(DCDate, n = 2))), 1 ,0),
             
             #valid if any one of the criteria above met
             valid = if_else(isolated_timepoint == 1 |
                               single_cat_priority_repeated_instr_lag_lead_range == 1 |
                               multi_cat_neighbors_six_months_plus == 1 |
                               six_month_neighbor_instr_repeated_lag_lead_range == 1 |
                               stuck_between_single_cat == 1 |
                               six_month_single_category_priority_far_neighbors == 1, 1, 0))%>%
      filter(valid == 1)%>%
      distinct(PIDN, DCDate)}) ->
  timepoints_dfs 

Additional Timepoint Modification

  • Currently only used for specimens dataset
# Specimens only: merge the generated timepoints back with the raw specimen
# rows, then keep a visit if it carries a SampleID, or if none of the visits
# within +/-365 days of it carry one (so sample-less timepoints survive only
# when no sampled visit is nearby). Result is written back into timepoints_dfs
# via the purrr `pluck<-` replacement function.
pluck(original_data_tps, 'original_data_specimens')%>%
  bind_rows(pluck(timepoints_dfs, 'original_data_specimens'))%>%
  arrange(PIDN, DCDate, SampleID)%>%
  group_by(PIDN)%>%
  #drop visits that precede a participant's first sample
  filter(cumsum(!is.na(SampleID)) >= 1)%>%
  distinct(PIDN, DCDate, .keep_all = TRUE)%>%
  #for each visit, nest the participant's other visits within 365 days
  group_map(
    ~ mutate(.x, spec_tps = 
               lapply(DCDate, function(d) 
                 .x[(which((365 >= (abs(difftime(DCDate, d, units = 'days')))) & 
                             #makes sure current visit isn't included in own nested data
                             (0 != (abs(difftime(DCDate, d, units = 'days')))))),])), 
    .keep = TRUE)%>%
  bind_rows()%>%
  #keep sampled visits, plus sample-less visits whose 365-day neighbors are
  #also all sample-less
  filter((!is.na(SampleID) | map_lgl(pluck(.,"spec_tps"), 
                                     function(x) all(is.na(pluck(x,'SampleID'))))))%>%
  distinct(PIDN, DCDate, SampleID) -> 
  pluck(timepoints_dfs, 'original_data_specimens')

Create Master PIDN/DCDate List

  • Used for filtering only relevant data when generating demographics
# Master (PIDN, DCDate) list used later to restrict demographics to relevant
# visits. Either source list may be absent; `if` without `else` returns NULL,
# which bind_rows() silently ignores.
all_pidns_dates <- bind_rows(
  if (exists('timepoints_dfs'))
    map(timepoints_dfs, ~select(.x, PIDN, DCDate))%>%bind_rows(),
  if (exists('original_data_no_tps'))
    map(original_data_no_tps, ~select(.x, PIDN, DCDate))%>%bind_rows()
)%>%
  distinct(PIDN, DCDate)%>%
  arrange(PIDN, DCDate)

Step 2

Step 2 Simplified Flow

LAVA data

LAVA Flow

Remove duplicated LAVA Data

# Deduplicate the 1back / 2back / setshifting instruments using their custom
# visit-type priority order; start the running list of processed instruments.
custom_instruments <- c("1back", "2back", "setshifting")
instrum_list <- custom_instruments
custom_v_type_levels <- c('spatial cog', 'battery', 'a&c cog', 'tp1 cog')

# modify_at forwards extra args to clean_instrument, so no wrapper lambda needed
lava <- modify_at(lava, custom_instruments, clean_instrument,
                  v_type_levels = custom_v_type_levels,
                  replace_neg_values = TRUE)

# Deduplicate medications, subject demographics, and neuroexam instruments
# using the neuro-exam visit-type priority order; extend the processed list.
custom_instruments <- c("udsmedicalconditions", "udsmedicationsnonprescrver1", "udsmedicationsprescrver1",
                        "udsmedicationsvitasupver1", "udssubjectdemo", "adrcneuroexam")
instrum_list <- append(instrum_list, custom_instruments)
custom_v_type_levels <- c('neuroexam', 'adrc neuroexam', 'ppg neurological exam', 'diagnostic evaluation')

# modify_at forwards extra args to clean_instrument, so no wrapper lambda needed
lava <- modify_at(lava, custom_instruments, clean_instrument,
                  v_type_levels = custom_v_type_levels,
                  replace_neg_values = TRUE)

# Bedsidescreen: deduplicate, scrub negative sentinel values, fill gaps upward,
# and record which instr_ids contributed to each filled row.
custom_v_type_levels <- c('neuroexam', 'adrc neuroexam', 'ppg neurological exam', 'diagnostic evaluation')

clean_instrument(pluck(lava, 'bedsidescreen'),
                    v_type_levels = custom_v_type_levels, replace_neg_values = TRUE)%>%
  #recode negative sentinel scores to NA across every column
  mutate(across(everything(), function(x) {replace(x, which(x < 0), NA)}))%>%
  #NOTE(review): fill() and the str_c(collapse = ...) below operate within the
  #existing grouping; this assumes clean_instrument() returns data grouped by
  #participant -- if ungrouped, values would fill across PIDNs and
  #filled_with_instr_ids would collapse the entire column. TODO confirm.
  fill(everything(), .direction = "up")%>%
  #record which instr_ids contributed to each (filled) row
  mutate(filled_with_instr_ids = str_c(instr_id, collapse = ", "))%>%
  relocate(filled_with_instr_ids, .after = instr_id)%>%
  distinct(PIDN, DCDate, .keep_all = TRUE) -> pluck(lava, 'bedsidescreen')


# Deduplicate cdr / npi / faq using the nursing & informant visit-type
# priority order; extend the processed list.
custom_instruments <- c("cdr", "npi", "faq")
instrum_list <- append(instrum_list, custom_instruments)
custom_v_type_levels <- c('nursing', 'adrc nursing', '6 month', '12 month', 'informant', 'ppg informant interview', 'diagnostic evaluation')

# modify_at forwards extra args to clean_instrument, so no wrapper lambda needed
lava <- modify_at(lava, custom_instruments, clean_instrument,
                  v_type_levels = custom_v_type_levels,
                  replace_neg_values = TRUE)

# Deduplicate the UDS physical instruments using their custom visit-type
# priority order; extend the processed list.
custom_instruments <- c("udsphysical", "hbudsphysical")
instrum_list <- append(instrum_list, custom_instruments)
custom_v_type_levels <- c('neuroexam', 'adrc neuroexam', '12 month', 'hb intermediary cog', 'ppg neurological exam', 'a&c cog', 'tp1 cog')

# modify_at forwards extra args to clean_instrument, so no wrapper lambda needed
lava <- modify_at(lava, custom_instruments, clean_instrument,
                  v_type_levels = custom_v_type_levels,
                  replace_neg_values = TRUE)

# Deduplicate every instrument not yet processed, using clean_instrument's
# default visit-type and dc-type priority levels.
remaining_instruments <- setdiff(names(lava), instrum_list)
lava <- modify_at(lava, remaining_instruments, clean_instrument,
                  replace_neg_values = TRUE)

Demographics

# Access the object from the global environment
#all_pidns_dates <- get("all_pidns_dates", envir = globalenv())


# Join UDS subject demo onto the LAVA demographics table to fill in blanks
# where possible. Missing-data codes (<0, 88, 99) are blanked to NA on both
# sides first so a real value always wins in the coalesce.
coalesce_join(pluck(lava, 'demographics')%>%
                ungroup()%>%
                select(PIDN, dob, deceased, hand, gender, educ, primary_language, testing_language, span_or, y_span_or, mult_rac, race_simple)%>%
                mutate(across(c(hand, educ, primary_language, race_simple, span_or, y_span_or), function(x) 
                  {replace(x, which(x<0 | x == 88 | x == 99), NA)})),
              pluck(lava, 'udssubjectdemo')%>%
                ungroup()%>%
                select(PIDN, DCDate, handed, educ, primlang, primlanx, race, racex, racesec, racesecx, raceter, raceterx, hispanic, hispor, hisporx)%>%
                #remove missing values 
                mutate(across(-PIDN, function(x) {replace(x, which(x < 0 | x == 88 | x == 99), NA)}))%>%
                #keep the single most complete, most recent record per PIDN
                group_by(PIDN)%>%
                arrange(rowSums(is.na(.)), desc(DCDate))%>%
                select(-DCDate)%>%
                distinct(PIDN, .keep_all = TRUE)%>%
                # recode UDS numeric codes into the LAVA demographics
                # vocabulary so coalesce_join can match on shared columns
                mutate(hand = case_when(handed == 1 ~ 'LEFT',
                                        handed == 2 ~ 'RIGHT', 
                                        handed == 3 ~ 'AMBIDEXTROUS'),
                       primary_language = case_when(primlang == 1 ~ 'English',
                                                    primlang == 2 ~ 'Spanish',
                                                    primlang == 3 ~ 'Mandarin',
                                                    primlang == 4 ~ 'Cantonese', 
                                                    primlang == 5 ~ 'Russian', 
                                                    primlang == 6 ~ 'Japanese', 
                                                    primlang == 8 ~ primlanx), 
                       race_simple = case_when(race == 1 & is.na(racesec) ~ 1, 
                                               race == 2 & is.na(racesec) ~ 2),
                       span_or = case_when(hispanic == 0 ~ 2, 
                                           hispanic == 1 ~ 1),
                       y_span_or = case_when(hispor == 3 ~ 5, 
                                             hispor == 2 ~ 4, 
                                             hispor == 5 ~ 3, 
                                             hispor == 6 ~ 2))%>%
                # BUG FIX: select the recoded race_simple (built just above)
                # instead of the raw UDS `race` column -- the original
                # selected `race`, so the computed race_simple was discarded
                # and never contributed to the coalesce.
                select(PIDN, hand, primary_language, race_simple, span_or, y_span_or), by = 'PIDN')%>%
  select(PIDN, dob, deceased, hand, gender, educ, primary_language, testing_language, span_or, y_span_or, mult_rac, race_simple) -> demographics_df 

# recode demographics for the pidns in environment
# Attach the PIDN list (all_pidns_dates) so any RFM flag column rides along,
# translate the numeric span_or / y_span_or / race_simple codes into labels,
# then keep the single most complete row per PIDN.
# NOTE(review): the label spellings below ('Laotion', 'Guamamian', 'Somoan')
# are kept verbatim because they are data values emitted downstream --
# confirm with stakeholders before correcting them.
demographics_df%>%
  left_join(all_pidns_dates,
    by = 'PIDN')%>%
  select(PIDN, dob, matches('\\bRFM\\b'), deceased, hand, gender, educ, primary_language, testing_language, span_or, y_span_or, mult_rac, race_simple)%>%
  mutate(span_or_text = recode(span_or, '1' = 'Yes', '2' = 'No'),
         y_span_or_text = recode(y_span_or,
                                  '1' = 'North American',
                                  '2' = 'South American',
                                  '3' = 'Central American',
                                  '4' = 'Puerto Rican',
                                  '5' = 'Cuban',
                                  '6' = 'Haitian',
                                  '7' = 'Other Spanish/Hispanic/Latino'),
         race_simple_text = recode(race_simple,
                                  '1' = 'White',
                                  '2' = 'Black/AfricanAmerican',
                                  '3' = 'Asian Indian',
                                  '4' = 'Cambodian',
                                  '5' = 'Chinese',
                                  '6' = 'Filipino',
                                  '7' = 'Japanese',
                                  '8' = 'Hmong',
                                  '9' = 'Korean',
                                  '10' = 'Laotion',
                                  '11' = 'Vietnamese',
                                  '12' = 'Other Asian',
                                  '13' = 'Native Hawaiian',
                                  '14' = 'Guamamian',
                                  '15' = 'Somoan',
                                  '16' = 'Other Pacific Islander',
                                  '17' = 'American Indian',
                                  '18' = 'Other Race'))%>%
    # rows with the fewest NAs sort first, so distinct() keeps the most
    # complete record per PIDN
    arrange(PIDN, rowSums(is.na(.)))%>%
    distinct(PIDN, .keep_all = TRUE) -> demographics_curated_df

Brain Health Assessment

# Brain Health Assessment scores; the -5 missing-data code becomes NA
# across all score columns.
brainhealthassessment <- lava$brainhealthassessment%>%
  select(PIDN, DCDate, instr_type, v_type, digitsymbol_corr, digitsymbol_err, favorites_recall1,
         favorites_recall2, favorites_delay, favorites_total, favorites_recog_corr, favorites_recog_sme,
         favorites_recog_err, par_line_score, par_line_catchtrial, lo_score, lo_catchtrial)%>%
  mutate(across(digitsymbol_corr:lo_catchtrial,
                function(score) replace(score, which(score == -5), NA)))

CDR

# CDR global score and sum of boxes; negative codes become NA.
cdr <- lava$cdr%>%
  select(PIDN, DCDate, instr_type, cdr_tot, box_score)%>%
  mutate(across(c(cdr_tot, box_score), ~replace(.x, which(.x < 0), NA)))

Clinical Labs

# Clinical labs panel (glucose, lipids, inflammation, insulin resistance).
clinical_labs <- lava$hbclinicallabs_myelinucddra%>%
  select(PIDN, DCDate, instr_type, glucose_mg_d_l, hemoglobin_a1c_percent, insulin_u_u_ml, total_cholesterol_mg_dl,
         triglycerides, hdl_mg_d_l, ldl_mg_d_l, hs_crp_mg_l, cholesterol_hdl_ratio, non_hdl_cholesterol, homa_ir)

CPT

# Continuous Performance Test totals.
cpt <- lava$cpt%>%
  select(PIDN, DCDate, total_corr, total_errors)

Diagnosis

# Clinical syndrome / research diagnosis codes; the -8 code becomes NA.
diagnosis <- lava$diagnosis%>%
  select(PIDN, DCDate, clin_syn_best_est, clin_syn_sec_est, res_dx_a, res_dx_b)%>%
  na_if(-8)

Diagnosis Latest

# Most recent diagnosis per PIDN: rows with the fewest NAs sort first, so
# distinct() keeps the most complete record for each participant.
diagnosis_latest <- lava$diagnosis_latest%>%
  ungroup()%>%
  select(PIDN, clin_syn_best_est, clin_syn_sec_est, res_dx_a, res_dx_b)%>%
  na_if(-8)%>%
  arrange(PIDN, rowSums(is.na(.)))%>%
  distinct(PIDN, .keep_all = TRUE)

Early Dev History

# Early developmental history items: negative codes become NA, rows with no
# answered item are dropped, then the 9 ("unknown") code becomes NA too.
# (Order matters: the row filter runs before the 9s are blanked.)
earlydevhistory <- lava$earlydevhistory%>%
  select(PIDN, DCDate, instr_type, edh_dys, edh_mot, edh_lan, edh_att, edh_imp, edh_anti, edh_shy,
         edh_dep, edh_anx, edh_obs, edh_oth, edh_read, edh_spell, edh_math, edh_f_lan, edh_mech,
         edh_sport, edh_music, edh_art, edh_hyp)%>%
  mutate(across(where(is.numeric), ~replace(.x, which(.x < 0), NA)))%>%
  filter(rowSums(across(edh_dys:edh_hyp, ~ !is.na(.))) > 0)%>%
  na_if(9)

FAQ

# FAQ total: valid range is 0-30, so negatives and values above 30 become
# NA; incomplete rows are then dropped entirely.
faq <- lava$faq%>%
  select(PIDN, DCDate, faq_tot)%>%
  mutate(faq_tot = replace(faq_tot, which(faq_tot < 0 | faq_tot > 30), NA))%>%
  drop_na()

Fisherman Story

# Fisherman story recall totals; negative codes become NA.
fishermanstory <- lava$fishermanstory%>%
  select(PIDN, DCDate, instr_type, total30min, totalweek)%>%
  mutate(across(c(total30min, totalweek), ~replace(.x, which(.x < 0), NA)))

Medications

# Prescription medications: stack the v1 long-format form (with its
# missing-data codes blanked) with the v2 details form pivoted wide to one
# logical drugid_* column per drug.
# NOTE(review): values_fn builds cell contents with paste(is.na(.x)) /
# paste(lst(.x)), i.e. "TRUE"/"FALSE" strings that as.logical() below then
# parses -- confirm multi-row PIDN/DCDate/drugid combos collapse the way
# intended. drugid 99999 is renamed drugid_NA ("not listed"?) -- verify.
bind_rows(
  lava$udsmedicationsprescrver1%>%
    select(-v_type, -dc_status, -age_at_dc, -instr_id)%>%
    mutate(across(c(pmeds:pmtpfu),
                  ~ifelse(. %in% c(-2, -3, -4, -5, -6, -7, -8, -9, 88, 95, 96, 97, 98, 888, 8888, 995, 996, 997, 998, 999), NA, .))),
  lava$udsmedicationsdetailsver2%>%
    pivot_wider(id_cols = c('PIDN', 'DCDate','instr_type'),
                names_from = 'drugid',
                names_glue = "drugid_{drugid}",
                values_from = c('not_listed'),
                values_fn = ~ifelse(all(is.na(.x)), paste(is.na(.x)), paste(lst(.x)))))%>%
    rename(drugid_NA = drugid_99999)%>%
    # every drugid_* column except the free-text one becomes TRUE/FALSE
    mutate(across(-c(instr_type, drugid_NA), ~as.logical(.)))%>%
    # natural sort so drugid_2 precedes drugid_10
    relocate(PIDN, DCDate, instr_type, gtools::mixedsort(names(.))) ->
  medications

Neuroexam

# ADRC neuroexam motor scores; the -5 code becomes NA.
adrcneuroexam <- lava$adrcneuroexam%>%
  select(PIDN, DCDate, instr_type, updrs, psp_oc, psp_limb, psp_gait)%>%
  na_if(-5)

NPI

#npi -- joining the two npi instruments
# Full join of the UDS NPI-Q (columns prefixed uds_) with the full NPI on
# PIDN/DCDate, so a visit keeps whichever version(s) exist. Negative codes
# become NA everywhere; the 9 ("not applicable/unknown") code becomes NA on
# all item columns but is preserved on the *_totl / total columns, where 9
# can be a legitimate score.
full_join(lava$udsnpi%>%
            rename_with(~paste('uds',., sep = '_'), -c('PIDN':'instr_id'))%>%
            select(PIDN, DCDate, uds_npiqinf, uds_npiqinfx, uds_del, uds_delsev, uds_hall, uds_hallsev, 
                   uds_agit, uds_agitsev, uds_depd, uds_depdsev, uds_anx, uds_anxsev, uds_elat,
                   uds_elatsev, uds_apa, uds_apasev, uds_disn, uds_disnsev, uds_irr, uds_irrsev,
                   uds_mot, uds_motsev, uds_nite, uds_nitesev, uds_app, uds_appsev),
          lava$npi%>%
            select(PIDN, DCDate, delusn, dngr, steal, affair, vistrs, claim,
                   home, abandon, t_vfigs, del_oth, del_freq, del_sev, del_dis, del_totl,
                   hlcntns, voices, talk, see, smell, touch, taste, hal_oth, hal_freq,
                   hal_sev, hal_dis, hal_totl, agitate, resist, stubborn, help, behavior,
                   curse, throw, hit, ag_oth, ag_freq, ag_sev, ag_dis, ag_totl, dprssn,
                   tearful, sad, failure, punish, future, burden, death, dep_oth, dep_freq,
                   dep_sev, dep_dis, dep_totl, anxiety, worry, tense, sighing, nervous,
                   avoid, upset, anx_oth, anx_freq, anx_sev, anx_dis, anx_totl, euphoria,
                   happy, humor, giggle, jokes, pranks, truth, eup_oth, eup_freq, eup_sev,
                   eup_dis, eup_totl, apathy, spntns, convrs, emotion, chores, intrst, 
                   friends, enthuse, apth_oth, apth_freq, apth_sev, apth_dis, apth_totl, 
                   disinhibition, impulsiv, stranger, hurt, crude, openly, hug, dis_oth,
                   dis_freq, dis_sev, dis_dis, dis_totl, irritble, temper, mood, anger, 
                   coping, cranky, difficult, irr_oth, irr_freq, irr_sev, irr_dis, irr_totl,
                   motor, pace, rummage, clothing, habits, repetitive, fidget, mot_oth,
                   mot_freq, mot_sev, mot_dis, mot_totl, sleep, fall_asleep, night, wander,
                   awaken, start, early, day, sleep_oth, sle_frq, sle_sev, sle_dis, sle_totl,
                   eat, appetite, increase, wght_loss, wght_gain, change, food, food_type, 
                   eat_oth, eat_freq, eat_sev, eat_dis, eat_totl, total, dstrs_tot, npi_q), by = c('PIDN','DCDate'))%>%
  mutate(across(c(uds_npiqinf:npi_q), function(x) {replace(x, which(x < 0), NA)}))%>%
  mutate(across(-c(del_totl, hal_totl, ag_totl, dep_totl, anx_totl, dstrs_tot, total, eat_totl, sle_totl, mot_totl, irr_totl, dis_totl, apth_totl, eup_totl),
                ~na_if(.x, 9)))%>%
  arrange(PIDN, DCDate) -> 
  npi 

OTC Meds

#uds otc meds -- over-the-counter medications; the UDS missing/not-applicable
# codes become NA.
otc_meds <- lava$udsmedicationsnonprescrver1%>%
  select(-v_type, -dc_status, -age_at_dc, -instr_id)%>%
  mutate(across(nmeds:nmtfu,
                ~ifelse(. %in% c(-2, -3, -4, -5, -6, -7, -8, -9, 88, 95, 96, 97, 98, 888, 8888, 995, 996, 997, 998, 999), NA, .)))

Pattern Separation

# Pattern separation: LDI is computed as the similar-lure score minus the
# similar-foil score (per the formula here).
patternseparation <- lava$patternseparation%>%
  mutate(LDI = p_sep_sim_lure - p_sep_sim_foil)%>%
  select(PIDN, DCDate, instr_type, p_sep_sim_lure, p_sep_sim_foil, LDI)

Supplements and Vitamins

#uds vitamins data -- supplements and vitamins; the UDS missing codes
# become NA.
supplements <- lava$udsmedicationsvitasupver1%>%
  select(-v_type, -dc_status, -age_at_dc, -instr_id)%>%
  mutate(across(vitasups:vstfu,
                ~ifelse(. %in% c(-2, -3, -4, -5, -6, -7, -8, -9, 88, 95, 96, 97, 98, 888, 8888, 995, 996, 997, 998, 999), NA, .)))

UDS Health History

# UDS health history: smoking/alcohol, cardiovascular, cerebrovascular,
# neurological, metabolic, and psychiatric history items. All UDS
# missing/unknown codes (negatives, 9, 88/99-family) become NA.
lava$udshealthhistory%>%
  select(PIDN, DCDate, instr_type, tobac30, tobac100,smokyrs,packsper, quitsmok, alcoccas, alcfreq, cvhatt, hattmult,
         hattyear, cvafib, cvangio, cvbypass,cvpace, cvpacdef, cvchf, cvangina, cvhvalve, cvothr, cvothrx, cbstroke,
         strok1yr, strok2yr,strok3yr, strok4yr, strok5yr, strok6yr, strokmul, strokyr, cbtia, tia1yr, tia2yr, tia3yr,
         tia4yr, tia5yr, tia6yr, tiamult, tiayear, cbothr, cbothrx, pd, pdyr, pdothr,pdothryr, seizures, traumbrf,
         traumext, traumchr, tbi, tbibrief, tbiexten, tbiwolos, tbiyear,ncothr, ncothrx, hyperten, hypercho, diabetes,
         diabtype, b12def, thyroid, arthrit, arthtype,arthtypx, arthupex, arthloex, arthspin, arthunk, incontu, incontf,
         apnea, rbd, insomn,othsleep, othsleex, alcohol, abusothr, abusx, ptsd, bipolar, schiz, dep2yrs, depothr, anxiety,
         ocd, npsydev, psycdis, psycdisx)%>%
    mutate(across(c(tobac30:psycdisx),
                  ~ifelse(. %in% c(-2, -3, -4, -5, -6, -7, -8, -9, 9, 88, 95, 96, 97, 98, 99, 888, 8888, 995, 996, 997, 998, 999, 9999), NA, .))) ->
  health_history 

UDS Medical Conditions

# UDS medical conditions; missing/unknown codes (including 8 and 9 here)
# become NA across every condition column and the form version.
med_conditions <- lava$udsmedicalconditions%>%
  select(PIDN,DCDate, instr_type, cancer:othcondx, form_ver)%>%
  mutate(across(cancer:form_ver,
                ~ifelse(. %in% c(-2, -3, -4, -5, -6, -7, -8, -9, 8, 9, 88, 95, 96, 97, 98, 888, 8888, 995, 996, 997, 998, 999), NA, .)))

UDS Physical

# Vitals from both the UDS and HB physical forms: negative codes and the
# 777/88/888-family not-recorded codes become NA, then the merged rows are
# deduplicated with clean_instrument()'s default priorities.
physical <- bind_rows(lava$udsphysical, 
                      lava$hbudsphysical)%>%
  select(PIDN, DCDate, instr_type, height, weight, bpsys, bpdias, hrate)%>%
  mutate(across(where(is.numeric), ~replace(.x, which(.x < 0), NA)))%>%
  mutate(across(where(is.numeric), ~ifelse(. %in% c(777, 88, 88.8, 888, 99.9, 999), NA, .)))%>%
  clean_instrument()

Questionnaires

  • Run through 00_qualtrics_cleaning.rmd to read in and modify qualtrics instruments. A lot of the cleaning might only be needed retroactively, so moving forward, this might look different.

Qualtrics Flow

Qualtrics distribution tutorial

# Load the pre-cleaned qualtrics tables (a named list of data frames)
# produced by 00_qualtrics_cleaning.rmd.
# NOTE(review): the file carries a .RData extension but is read with
# readRDS(), so it must have been written with saveRDS() -- confirm.
qualtrics_data <- readRDS(str_c(datafolder,'qualtrics_data_2023_03_14.RData'))
#/Volumes/macdata/projects/hillblom/Datasets/Dataset_Summer_2022qualtrics_data_2022_11_16.RData

Boston University TBI

  • R Drive data was read in via the qualtrics_cleaning.rmd
# Boston University TBI questionnaire; the -99 code becomes NA.
bu_subset_data <- qualtrics_data$bu_subset_data%>%
  na_if(-99)

OSU TBI

  • R Drive data was read in via the qualtrics_cleaning.rmd
# OSU TBI LOC/PTA subset -- passed through unchanged.
osu_tbi_locpta_subset_data <- qualtrics_data$osu_tbi_locpta_subset_data

Perceived Stress Scale (PSS)

# Perceived Stress Scale total, tagged PSS-10 and deduplicated by source
# priority.
pss <- qualtrics_data$qualtrics_hap_perceived_stress_scale%>%
  mutate(instr_type = 'PSS-10')%>%
  select(PIDN, DCDate, source, instr_type, PSSTotal)%>%
  prioritize_qualtrics_data()

Social Network Index (SNI)

# Social Network Index, keeping only reasonably complete responses.
social_network_index <- qualtrics_data$qualtrics_sni_social_network_index%>%
  ungroup()%>%
  # keep rows with fewer than five missing values
  filter(rowSums(is.na(.)) < 5)%>%
  prioritize_qualtrics_data()

GRIT

# GRIT scale: qualtrics responses stacked with completed LAVA
# administrations, then deduplicated by source priority.
grit <- qualtrics_data$qualtrics_hap_grit_scale%>%
  bind_rows(lava$grit%>%
              filter(dc_status == 'Complete')%>%
              mutate(source = 'lava'))%>%
  mutate(instr_type = 'GRIT')%>%
  select(PIDN, DCDate, instr_type, source, grit_tot, grit_consistence_tot, grit_persev_eff_tot, grit_score, grit_consistence_score, grit_persev_eff_score)%>%
  prioritize_qualtrics_data()

Edinburgh Handedness

# Edinburgh Handedness Inventory: qualtrics responses stacked with
# completed LAVA administrations (negatives blanked), then the 9 code on
# the natural-hand items becomes NA and duplicates are resolved by source.
edinburgh_handedness <- qualtrics_data$qualtrics_edinburgh_handedness_inventory%>%
  bind_rows(lava$edinburghhandedness%>%
              # replace negative values with NA
              mutate(source = 'lava', 
                     across(everything(), ~replace(.x, which(.x < 0), NA)))%>%
              # keep only completed administrations
              filter(dc_status == 'Complete'))%>%
  select(PIDN, DCDate, source, instr_type, ehi1, ehi2, ehi3, ehi4, ehi5, ehi6, ehi7, ehi8, ehi9, ehi10, ehi11, ehi12, ehi_tot, natural_l, natural_r)%>%
  mutate(across(c(natural_l, natural_r), ~na_if(.x, 9)))%>%
  mutate(instr_type = 'Edinburgh Handedness')%>%
  prioritize_qualtrics_data()

MIND Diet

# MIND diet score: qualtrics responses stacked with completed LAVA
# administrations (names/types normalized first), then deduplicated.
diet <- qualtrics_data$qualtrics_hap_diet%>%
  bind_rows(lava$minddiet%>%
              filter(dc_status == 'Complete')%>%
              clean_names()%>%
              rename(DCDate = dc_date,
                     PIDN = pidn)%>%
              mutate(PIDN = as.numeric(PIDN), 
                     DCDate = as_date(DCDate),
                     source = 'lava'))%>%
  # prioritize_qualtrics_data resolves any PIDN/DCDate duplicates
  mutate(instr_type = 'MIND Diet')%>%
  select(PIDN, DCDate, instr_type, source, dietq_mindscore)%>% 
  prioritize_qualtrics_data()

GAD

# GAD-7: qualtrics responses stacked with the LAVA gad7 instrument, keeping
# only the total after blanking negative codes.
# NOTE(review): group_by(PIDN, DCDate) is never ungrouped, and the final
# select drops `source` while every sibling questionnaire keeps it --
# confirm prioritize_qualtrics_data() tolerates both (it may rely on
# `source` to rank duplicates).
qualtrics_data$qualtrics_gad%>%
  bind_rows(lava$gad7%>%
              mutate(source = 'lava'))%>%
  group_by(PIDN,DCDate)%>%
  #all v_type == SOCIAL FX PSYCHIATRY, self_other == Self, instr_type == GAD7 so not including those columns
  select(PIDN, DCDate, source, gad7_1, gad7_2, gad7_3, gad7_4, gad7_5, gad7_6, gad7_7, gad7_tot)%>%
  mutate(across(c(gad7_tot), function(x) {replace(x, which(x < 0), NA)}))%>%
  mutate(instr_type = 'GAD-7')%>%
  select(PIDN, DCDate, instr_type, gad7_tot)%>% 
  prioritize_qualtrics_data() ->
  gad

MAAS Mindfulness

# MAAS mindfulness: the total is the per-respondent sum of the item columns.
# BUG FIX: the original used sum(across(...)) without rowwise(), which
# collapses every item of every row into one grand total repeated down the
# whole column. rowSums() computes the intended per-row total, and (like
# sum()'s default na.rm = FALSE) yields NA when any item is missing.
qualtrics_data$qualtrics_hap_maas_mindfulness%>%
  mutate(instr_type = 'MAAS', 
         maas_mindfulnesstot = rowSums(across(maas_notawareemotion:maas_snackwithoutattn)))%>%
  select(PIDN, DCDate, instr_type, maas_mindfulnesstot, maas_mindfulnessscore)%>%
  prioritize_qualtrics_data() ->
  maas_mindfulness

Physical Activity Scale (PASE)

# Physical Activity Scale for the Elderly total.
pase <- qualtrics_data$qualtrics_hap_physical_activity_scale%>%
  select(PIDN, DCDate, source, pase_pase_total)%>%
  prioritize_qualtrics_data()

Cognitive Activity Scale (CAS)

# Cognitive Activity Scale: qualtrics responses stacked with completed LAVA
# administrations. Per-row (rowwise) scoring:
#   * the 9 code on CAS items is treated as 0 (not done),
#   * cas_tot_pts reverse-scores each non-zero item (6 - value) and sums,
#   * cas_score divides the total by the count of non-zero items
#     (as.integer truncation is kept as in the original),
#   * lca_reading / lca_total sum the lifetime reading / media items.
# BUG FIX: lca_lib_crd was listed twice in the lca_reading sum, so that
# item was double-counted; it now appears once (matching lca_total, which
# lists it once).
# NOTE(review): the result name 'cogntive_activity_scale' keeps the
# original misspelling because downstream code may reference it.
qualtrics_data$qualtrics_hap_cognitive_activity_scale%>%
  bind_rows(lava$cognitiveactivity%>%
              filter(dc_status == 'Complete')%>%
              select(PIDN,DCDate,cas_game6:last_col())%>%
              mutate(across(everything(), function(x){replace(x, which(x<0), NA)}),
                     source = 'lava'))%>%
  ungroup()%>%
  rowwise()%>%
  mutate(across(c(cas_game6:cas_games_yr),
                function(column_value) {ifelse(column_value == 9, 0, column_value)}),
         cas_tot_pts = sum(across(
           c(cas_game6,
             cas_read6,
             cas_story6,
             cas_lib12,
             cas_paper12, 
             cas_mags12,
             cas_read12,
             cas_write12,
             cas_game12,
             cas_lib18,
             cas_paper18,
             cas_mags18,
             cas_read18,
             cas_write18,
             cas_game18,
             cas_paper40,
             cas_mags40, 
             cas_books40,
             cas_write40,
             cas_games40,
             cas_paper_yr,
             cas_mags_yr,
             cas_books_yr,
             cas_write_yr,
             cas_games_yr),
           function(column_value) { ifelse(column_value != 0, 6 - column_value, column_value)}), na.rm = FALSE),
         cas_score = as.integer(cas_tot_pts/sum(across(
           c(cas_game6,
             cas_read6,
             cas_story6,
             cas_lib12,
             cas_paper12, 
             cas_mags12,
             cas_read12,
             cas_write12,
             cas_game12,
             cas_lib18,
             cas_paper18,
             cas_mags18,
             cas_read18,
             cas_write18,
             cas_game18,
             cas_paper40,
             cas_mags40, 
             cas_books40,
             cas_write40,
             cas_games40,
             cas_paper_yr,
             cas_mags_yr,
             cas_books_yr,
             cas_write_yr,
             cas_games_yr),
           function(column_value) { column_value != 0 }), na.rm = FALSE)),
         lca_reading = sum(across(
           c(lca_paper,
             lca_mags,
             lca_books,
             lca_bks_yr,
             lca_lib_crd,
             lca_bkcase,
             lca_bks_cnt)), na.rm = FALSE),
         lca_total = sum(across(
           c(lca_radio,
             lca_ra_news,
             lca_tv,
             lca_tv_news,
             lca_paper,
             lca_mags,
             lca_books,
             lca_bks_yr,
             lca_lib_crd,
             lca_bkcase,
             lca_bks_cnt)), na.rm = FALSE))%>%
  select(PIDN, DCDate, source, cas_tot_pts, cas_score, lca_reading, lca_total)%>%
  prioritize_qualtrics_data() ->
  cogntive_activity_scale

Everyday Cognitive Self Report

# Everyday Cognition self-report: qualtrics responses stacked with the LAVA
# instrument, trimmed to the columns present in both sources.
# NOTE(review): the arrange-by-NA-count runs BEFORE negatives are blanked,
# so the completeness ranking counts negative codes as data; there is also
# no distinct() here, so deduplication is presumably left to
# prioritize_qualtrics_data() -- confirm both are intended.
qualtrics_data$qualtrics_hap_everyday_cognition_patient_self_report_form%>%
  bind_rows(lava$everydaycogself%>%
  arrange(PIDN,DCDate)%>%
  select(PIDN,DCDate,ec_score:last_col())%>%
  mutate(source = 'lava'))%>%
  #select only cols that are in all sources
  select(PIDN, DCDate, source, ec_concern, ec_mem_score:ec_score)%>%
  #arrange by rows with most data if duplicates
  arrange(PIDN, DCDate, rowSums(is.na(.)))%>%  #replace negative values with NA
  mutate(across(everything(), function(x){replace(x, which(x<0), NA)}))%>%
  select(PIDN, DCDate, ec_concern, ec_mem_score, ec_lang_score, ec_vis_score, ec_plan_score, ec_org_score, ec_attn_score, ec_other_score, ec_score)%>%
  mutate(instr_type = 'Everyday Cog Self')%>%
  relocate(instr_type, .after = DCDate)%>%
  prioritize_qualtrics_data() -> everydaycogself

Sex and Reproductive Health

# Sex and reproductive health: blank implausible responses.
# BUG FIX: the original `a < b | a < c` condition propagates NA -- when a
# comparator (age at first child, or ovary-removal age) is missing,
# FALSE | NA is NA and ifelse() then clobbered a valid
# sexQ_ageatlastperiod to NA. coalesce(..., FALSE) treats an unanswerable
# comparison as "not invalid", so only a demonstrably inconsistent age is
# blanked.
qualtrics_data$qualtrics_sex_and_reproductive_health_questions%>%
  #add check to replace invalid responses with NA
  mutate(sexQ_ageatlastperiod = ifelse(coalesce(sexQ_ageatlastperiod < sexQ_children_ageatfirstchild, FALSE) |
                                         coalesce(sexQ_ageatlastperiod < sexQ_ovaryremoval_age, FALSE),
                                       NA, sexQ_ageatlastperiod), 
         # more than 40 second-trimester pregnancies is not plausible
         sexQ_pregnancies_secondtrimester = ifelse(sexQ_pregnancies_secondtrimester > 40, NA, sexQ_pregnancies_secondtrimester))%>%
  prioritize_qualtrics_data() ->
  sex_and_reproductive_health

CHAMPS

# CHAMPS activity questionnaire: qualtrics only (the LAVA rows are dropped
# because the MET-item dictionary for them is unavailable, so the two
# sources do not match). Keeps the counts/hours summaries, per-question
# MET-weighted values, and total caloric expenditure.
qualtrics_data$qualtrics_hap_champs%>%
  #filter out lava data since it doesn't match--don't have dict for MET items
  filter(source != 'lava')%>%
  select(PIDN, DCDate, version, DCDate_physical, weight, date_diff_physical, TotalNum_Soc,
       TotalNum_Cog, TotalNum_Phys, TotalNum_ALL, WklyHrs_Soc, WklyHrs_Cog,
       WklyHrs_Phys, WklyHrs_ALL, New_Soc, New_Phys, New_Cog, New_ALL, WklyCount_Soc, WklyCount_Phys,
       WklyCount_Cog, WklyCount_ALL, METWeighted_Q13, METWeighted_Q15, METWeighted_Q20, METWeighted_Q21,
       METWeighted_Q24, METWeighted_Q25, METWeighted_Q26, METWeighted_Q27, METWeighted_Q28, METWeighted_Q29,
       METWeighted_Q30, METWeighted_Q31, METWeighted_Q32, METWeighted_Q33, METWeighted_Q34, METWeighted_Q35,
       METWeighted_Q36, METWeighted_Q37, METWeighted_Q38, METWeightcompleted, TotalCaloricExpenditure)%>%
  prioritize_qualtrics_data() ->
  champs

qualtrics_data <- map(qualtrics_data, ~ prioritize_qualtrics_data(.x))

R Drive Files

R Drive Flow

Virtual Bedside

# Remote (virtual) bedside testing exported from Qualtrics: read columns
# 18-96, rename them to the bedside variable names, blank negative codes,
# parse the date, and drop QC bookkeeping columns.
# NOTE(review): set_names() is given a tibble::lst() of 79 strings (one per
# selected column) rather than a plain character vector -- confirm the names
# land as intended; also note 'repeat' is an R reserved word used as a
# column name (safe only because it stays quoted/backticked downstream).
virtual_bedside <- read_csv(str_c(datafolder,'HB Remote Bedside Data/HAP+Virtual+Bedside_June+10,+2022_17.10.csv'), col_select = 18:96, skip = 1)
#virtual bedside 
virtual_bedside%>%
  set_names(lst('PIDN','CenterID','DCDate','MOCACUBE','MOCACLOCK','MOCALANGUAGE','CV2T1C','CV2T2C','CV2T3C','CV2T4C','CV2T5C',
                'CV2TbC','CV2SFRC','CV2SDCC','CV2LFRC','CV2LDCC','CV2T1I','CV2T2I','CV2T3I','CV2T4I','CV2T5I','CV2TbI','CV2SFRI',
                'CV2SDCI','CV2LFRI','CV2LDCI','CV2Hit','CV2bR','CV2bU','CV2NP','CV2NU','Rey10m','ReyRecg','WRATWrd','WRATLet',
                'WRATIr','Syntax','Verbal','repeat','repeat5','minttots','minttotw','mintscng','mintscnc','mintpcng','mintpcnc',
                'LngPVrb','LngPDes','LngPAni','LngPIna','modRey','numbLoc','Calc','CATSFMTot','CATSAMTot','StrpCNCor','StrpCNErr',
                'StrpCor','StrpErr','StrpSCE','Dcorr','Dreps','DRuleV','behav1','behav2','behav3','behav4','behav5','behav6',
                'behav7','behav8','behav9','behav10','researchStatus','qualityIssueLogical','qualityIssue1','qualityIssue2',
                'qualityIssue3','qualityNotes'))%>%
  mutate(across(where(is.numeric), function(x) {replace(x, which(x < 0), NA)}))%>%
  mutate(DCDate = as_date(DCDate, format = '%m/%d/%Y'))%>%
  select(-c(CenterID, qualityIssue1, qualityIssue2, qualityIssue3, qualityIssueLogical, qualityNotes)) ->
  virtual_bedside

TabCAT

  • tabcat_unsynced data comes from the R Drive; the tabcat data comes from the TabCAT team
# Join synced and unsynced TabCAT data, then separate into a list of dfs,
# since a data-collection period spans multiple dates.
tabcat_unsynced <- read_excel(str_c(datafolder,'TabCAT Data/EXAMINER Unsynced Data.xlsx'), guess_max = 1048576)

tabcat<- read_excel(str_c(datafolder,"TabCAT Data/MACResearch_TabCAT_2023_01_27.xlsx"), sheet = 'combo', col_types = 'text')
# Rename the unsynced sheet's human-readable headers to the synced export's
# column names so the two can be row-bound below.
tabcat_unsynced%>%
  rename(DCDate = `Visit Date`, 
         Flanker_TotalScore = "Flanker Total Score",
         RapidNaming_AvgReactionTime = "Rapid Naming Average Time",
         RapidNaming_TotalSkipped = "Rapid Naming Total Skipped",
         RapidNaming_TotalIncorrect = "Rapid Naming Total Incorrect",
         RapidNaming_TotalCorrect = "Rapid Naming Total Correct", 
         RunningDots_5Dot_PercentCorrect = "Running Dots 5 Dot % Correct", 
         RunningDots_4Dot_PercentCorrect = "Running Dots 4 Dot % Correct", 
         RunningDots_3Dot_PercentCorrect = "Running Dots 3 Dot % Correct", 
         RunningDots_2Dot_PercentCorrect = "Running Dots 2 Dot % Correct", 
         SetShifting_TotalScore = "Set Shifting Total Score",
         DotCounting_TotalScore = "Dot Counting Total Score")%>%
  mutate(DCDate = as_date(DCDate), 
         Flanker_TotalScore = as.numeric(Flanker_TotalScore))->
  tabcat_unsynced

# Normalize the synced TabCAT export and stack the unsynced rows onto it:
# Excel serial dates are converted (1899-12-30 origin), the various textual
# NA spellings become real NAs, reaction times lose their 'ms' suffix and
# comma decimal separator, and type_convert() re-infers column types.
tabcat <- tabcat%>%
  mutate(across(contains('date'),  ~ as.Date(as.numeric(.x), origin = "1899-12-30")))%>%
  rename(PIDN = Examinee_Identifier, DCDate = Encounter_StartDate)%>%
  mutate(across(where(is.character),
                ~replace(.x, which(.x %in% c('NA', 'N/A', 'NaN', 'null')), NA)), 
         RapidNaming_AvgReactionTime = parse_number(str_remove(str_replace(RapidNaming_AvgReactionTime, ',', '.'), 'ms')))%>%
  type_convert(trim_ws = TRUE)%>%
  bind_rows(tabcat_unsynced)

# Derive the instrument-name prefixes present in the combined TabCAT table.
# Each column name's leading segment (before the first underscore) names an
# instrument; Composite_* columns map to the BHA composite, and the
# favorites task columns share the 'fav' stem.
tabcat_instruments <- tabcat%>%
  select(!PIDN & !DCDate & !contains("Encounter") & !contains("Informant") & !contains('Examinee') & !contains('fav'))%>%
  names()%>%
  str_extract("[^_]+")%>%
  unique()%>%
  str_c('_')%>%
  str_replace('Composite_', 'BHA')%>%
  append('fav')

# Split the wide TabCAT table into one df per instrument (named e.g.
# 'tabcat_flanker' after make_clean_names()), dropping rows where that
# instrument was not administered.
tabcat <- map(tabcat_instruments,
              function(stem) {
                tabcat%>%
                  select(PIDN, DCDate, contains(stem))%>%
                  filter(!if_all(-c(PIDN, DCDate), is.na))
              })%>%
  set_names(make_clean_names(str_c('tabcat.', tabcat_instruments)))


tabcat_bha <- tabcat[['tabcat_bha']]
tabcat_animal_fluency <- tabcat[['tabcat_animal_fluency']]

# Consolidate dot counting: valid LAVA paper totals full-joined with valid
# TabCAT totals so both sources sit side by side per visit.
dot_counting <- full_join(
  lava$dotcounting%>%
    filter(!is.na(dot_counting_total) & dot_counting_total >= 0)%>%
    select(PIDN, DCDate, dot_counting_total)%>%
    rename(dot_counting_lava = dot_counting_total),
  tabcat$tabcat_dot_counting%>%
    filter(!is.na(DotCounting_TotalScore))%>%
    rename(dot_counting_total_tabcat = DotCounting_TotalScore),
  by = c('PIDN', 'DCDate'))%>%
  relocate(dot_counting_lava, dot_counting_total_tabcat, .after = DCDate)

# Flanker: coerce the score columns from the all-text import to numeric
# (textual 'null'/'N/A' become NA), and turn the 0/1 practice-trial flag
# into a logical while leaving any other spelling to as.logical().
tabcat$tabcat_flanker%>%
  mutate(across(c(Flanker_TaskDuration, Flanker_PTSet1_TotalCorrect, Flanker_PTSet2_TotalCorrect, Flanker_TotalScore, 
                  Flanker_Correct_Total, Flanker_Correct_MedianRT, Flanker_Correct_StDevRT, Flanker_CongrCorrect_Total, Flanker_CongrCorrect_MedianRT,
                  Flanker_IncongrCorrect_Total, Flanker_IncongrCorrect_MedianRT, Flanker_TotalScore_Z),
                ~ifelse(. == 'null'| . == 'N/A', NA, as.numeric(as.character(.)))),
         Flanker_PracticeTrialSuccess = as.logical(case_when(Flanker_PracticeTrialSuccess == '1' ~ 'TRUE', 
                                                    Flanker_PracticeTrialSuccess == '0' ~ 'FALSE',
                                                    TRUE ~ as.character(Flanker_PracticeTrialSuccess)))) ->
  tabcat_flanker


tabcat$tabcat_ll -> tabcat_ll 
tabcat$tabcat_lo -> tabcat_lo 

tabcat$tabcat_match%>%
  select(-c(Match_Correct_0to15:Match_Incorrect_105to120)) -> tabcat_match 

tabcat$tabcat_rapid_naming%>%
  select(-c(RapidNaming_T1_Score:RapidNaming_T60_VerbatimResponse)) -> tabcat_rapid_naming

tabcat$tabcat_running_dots -> 
  tabcat_running_dots

tabcat$tabcat_set_shifting -> tabcat_set_shifting

tabcat$tabcat_fav%>%
  select(-c(FavDelay_TaskVersion: FavDelay_animal4Score, FavRec_T1:FavRec_T24)) -> tabcat_fav 

Imaging

Imaging Markdown

#Torie put together
imaging_mci <- read_excel(str_c(datafolder,'MRI Datasets/MCI_dataset_03-2022.xlsx'), .name_repair = 'minimal')

#Corrina's imaging dataset
# FIX: dropped the leading '/' -- every other str_c(datafolder, ...) call in
# this file assumes datafolder already ends in a separator, so the original
# produced a doubled slash in the path.
# NOTE(review): the positional `0` argument to read_all_files is
# undocumented here (sheet index? skip?) -- confirm its meaning.
imaging_file <- c(str_c(datafolder,'MRI Datasets/MRI_Dataset_Summer2020_updated.xlsm'))
old_imaging <- read_all_files(imaging_file,  prefix = 'imaging', 0, clean_names = TRUE)

#Data uploaded by Yann to box folder
#lists files in the MRI Datasets/box directory
# FIX: `<-` instead of `=` for top-level assignment, matching the rest of
# the file.
mri_box_files <- list.files(str_c(datafolder,'MRI Datasets/box'), include.dirs = TRUE, recursive = TRUE, full.names = TRUE)
mri_box_data <- read_all_files(mri_box_files, prefix = 'imaging', clean_names = FALSE)
# for wmh, separate out prisma and trio into two columns-also separated out sfva scanned wmh values into their own column. prioritizing older data, new data, then torie/mci data
# for pasl, no changes needed since no overlapping data between old and new
# for pcasl, only need to create new vars. prioritizing older data
# for dti v2 and v6, only including old data until new data reprocessed
# for t1, combine old and new. need guidance on vars that don't overlap. prioritizing older data. still some questions about larger differences e.g., brain_stem
# hold off on functional connectivity for now


#Torie put together
# MCI imaging dataset (Torie): keep only the first occurrence of any
# duplicated column name, trim to PIDN plus the scanner columns, and
# standardize the visit date column to DCDate.
imaging_mci <- imaging_mci%>%
  subset(select = which(!duplicated(names(.))))%>%
  select(PIDN, ScannerID:last_col())%>%
  rename(DCDate = Date)%>%
  relocate(DCDate, .after = PIDN)%>%
  mutate(DCDate = as_date(DCDate))
  
#Data from R Drive
# Strip the '_mri*' suffix from each sheet name and standardize mri_date to
# a DCDate column. Sheets without an mri_date column fall through the
# if-with-no-else as NULL and are dropped by the keep() step.
# NOTE(review): the names() assignment below sets 8 names positionally --
# it silently mislabels the list if the workbook's sheet count or order
# ever changes; confirm against the source file.
old_imaging%>%
    set_names(names(.)%>%
                str_remove_all('_mri.*'))%>%
  map(.,function(x){
    if(any(names(x) == 'mri_date')){
      x%>%
        rename(DCDate = mri_date)%>%
        mutate(DCDate = as_date(DCDate))}})%>%
  keep(~'DCDate' %in% names(.x)) -> 
  old_imaging

names(old_imaging) <- c("imaging_t1", "imaging_wmh", "imaging_dti_v2", "imaging_dti_v6", "imaging_pcasl", "imaging_pasl", "imaging_fc_trio", "imaging_fc_prisma")

# Standardize every box file's date column (any name matching the whole
# word 'date') to DCDate.
mri_box_data <- map(mri_box_data, ~rename(.x, DCDate = matches('\\bdate\\b')))


#redoing this to have all data modified
mri_box_data_modified <- mri_box_data

# For each of the three table pairs below, the original element and its
# *_2 continuation are removed from the copy and then re-added as one
# combined table. NOTE(review): deleting with NULL and reassigning moves
# the element to the end of the list -- preserved here in case anything
# downstream depends on list order.
mri_box_data_modified$imaging_gm_all_production <- NULL
mri_box_data_modified$imaging_gm_all_production_2 <- NULL

mri_box_data_modified$imaging_gm_all_production <- 
  bind_rows(mri_box_data$imaging_gm_all_production,
            mri_box_data$imaging_gm_all_production_2%>%
              mutate(PIDN = as.numeric(PIDN)))%>%
  arrange(PIDN)#%>%
  #filter(Label == 'Sum')

#schaefer grey matter -- keep only the 'Sum' summary rows
mri_box_data_modified$imaging_gm_schaefer <- NULL
mri_box_data_modified$imaging_gm_schaefer_2 <- NULL

mri_box_data_modified$imaging_gm_schaefer <- 
  bind_rows(mri_box_data$imaging_gm_schaefer,
            mri_box_data$imaging_gm_schaefer_2%>%
              mutate(PIDN = as.numeric(PIDN)))%>%
  arrange(PIDN)%>%
  filter(Label == 'Sum')

#qsm- not sure which to take from this. think average since dti..?
# NOTE(review): taking the 'Mean' rows here is an open question per the
# comment above -- confirm with the imaging team.
mri_box_data_modified$imaging_qsm_template <- NULL
mri_box_data_modified$imaging_qsm_template_2 <- NULL

mri_box_data_modified$imaging_qsm_template <- 
  bind_rows(mri_box_data$imaging_qsm_template,
            mri_box_data$imaging_qsm_template_2%>%
              mutate(PIDN = as.numeric(PIDN)))%>%
  arrange(PIDN)%>%
  filter(Label == 'Mean')

# snake_case all column names for the combined tables
mri_box_data_modified_clean_names <- mri_box_data_modified%>%map(~clean_names(.x))


# WMH (white matter hyperintensity) volumes: stack the new Box summary
# sheets, the old R-Drive workbook, and the MCI dataset; NA-out volumes
# that failed any QC flag; keep one row per PIDN/DCDate, preferring the
# workbook rows (see the arrange() below).
bind_rows(
    #yann said to take the wmh load from this? 
    mri_box_data_modified_clean_names$imaging_summary_trio%>%
        mutate(source = 'new_trio_summary', 
               scanner = 'trio')%>%
        rename(wmh_mm3_trio = wmh_mm3),
    #yann said to take the wmh load from this? 
    mri_box_data_modified_clean_names$imaging_summary_prisma%>%
        mutate(source = 'new_prisma_summary', 
               scanner = 'prisma')%>%
        rename(wmh_mm3_prisma = wmh_mm3,
               wmh_qc = qc_wmh,
               flair_qc = qc_flair),
    #yann said to take the wmh load from this? 
    mri_box_data_modified_clean_names$imaging_summary%>%
        mutate(source = 'new_prisma_summary',
               scanner = 'prisma')%>%
        rename(wmh_mm3_prisma = wmh_mm3),
    # old workbook: scanner inferred from the pipeline string; the
    # 'supervised' pipeline on the SFVA 4T gets its own column
    old_imaging$imaging_wmh%>%
        mutate(source = 'workbook_wmh', 
               scanner = str_extract(wm_hpipeline, "(?<=_)[^_]+$"))%>%
        mutate(wmh_mm3_prisma = ifelse(scanner == 'prisma', wmh_mm3, NA),
               wmh_mm3_trio = ifelse(scanner == 'trio', wmh_mm3, NA),
               scanner = ifelse(scanner == 'supervised' & scanner_id == 'SFVA 4T MRI', 'SFVA 4T MRI', scanner),
               wmh_mm3_SFVA = ifelse(scanner_id == 'SFVA 4T MRI', wmh_mm3, NA))%>%
        clean_names(),
    imaging_mci%>%
        mutate(source = 'mci_dataset')%>%
        rename(csf_mm3 = `CSV(mm3)`,
               wmh_mm3 = `WMH(mm3)`, 
               scanner = Scanner)%>%
        mutate(scanner = tolower(scanner),
               wmh_mm3_prisma = ifelse(scanner == 'prisma', wmh_mm3, NA),
               wmh_mm3_trio = ifelse(scanner == 'trio', wmh_mm3, NA))%>%
        clean_names())%>%
  mutate(scanner = tolower(scanner),
           #setting data to NA if data didn't pass quality control
           across(c(gm_mm3:icv_mm3),
                  ~ifelse((!is.na(wmh_qc) & wmh_qc == 0) |
                              (!is.na(flair_qc) & flair_qc == 0) |
                              (!is.na(qc) & qc == 0) |
                              (!is.na(proc_status_wmh) & proc_status_wmh == 0), NA, .)))%>%
  # NOTE(review): log_wmh is computed here but dropped by the final
  # select() below — confirm whether it should be kept.
  mutate(log_wmh = log(wmh_mm3))%>%
  arrange(pidn, dc_date, factor(source, levels = c('workbook_wmh', 'new_prisma_summary', 'new_trio_summary', 'mci_dataset')))%>%
  rename(PIDN = pidn, DCDate = dc_date, scanner_inferred = scanner)%>%
  relocate(source, scanner_inferred,  .after = DCDate)%>%
  select(PIDN, DCDate, source, source_id, wmh_mm3_trio, wmh_mm3_prisma, wmh_mm3_sfva)%>%
  filter(!is.na(wmh_mm3_trio) | !is.na(wmh_mm3_prisma) | !is.na(wmh_mm3_sfva))%>%
  distinct(PIDN, DCDate, .keep_all = TRUE) ->
  imaging_wmh


#pasl joining 
# PASL CBF: old workbook + new Box 'Mean' rows; no overlapping scans, so
# the arrange() priority only matters for exact PIDN/DCDate ties.
# Columns not present in both sources are dropped at the end.
#new data does not have lh_ADROI.paslxsec   rh_ADROI.paslxsec   ADROI_bilat.paslxsec or globalCBF_nocerebellum.paslxsec
#old data doesn't have delta_t" "tiv"     "gm"      "wm"      "label"
bind_rows(old_imaging$imaging_pasl%>%
            clean_names()%>%
            rename_with(~str_remove_all(.x, '_paslxsec'))%>%
            mutate(source = 'workbook_pasl')%>%
            rename(DCDate = dc_date),
          mri_box_data_modified_clean_names$imaging_p_asl_cbf_pvc_gm_all_production%>%
            rename(DCDate = dc_date)%>%
            filter(label == 'Mean')%>%
            mutate(source = 'new_pasl'))%>%
  rename(PIDN = pidn)%>%
  arrange(PIDN, DCDate, factor(source, levels = c('workbook_pasl', 'new_pasl')))%>%
  relocate(source, .after = DCDate)%>%
  distinct(PIDN, DCDate, .keep_all = TRUE)%>%
  #removing cols that aren't in both
  select(-c(lh_adroi:label), -c(scanner_id:as_lpipeline), -global_cbf_nocerebellum)->
  imaging_pasl




# pCASL CBF: two new Box sheets (both labelled 'new_pcasl', 'Mean' rows
# only) plus the old workbook; workbook rows win PIDN/DCDate ties.
bind_rows(mri_box_data_modified_clean_names$imaging_pc_asl_cbf_pvc_gm_all_production%>%
            filter(label == 'Mean')%>%
            mutate(source = 'new_pcasl')%>%
            rename(DCDate = dc_date,
                   PIDN = pidn),
          mri_box_data_modified_clean_names$imaging_cbf_pvc_gm%>%
            rename(PIDN = pidn, DCDate = dc_date)%>%
            mutate(source = 'new_pcasl')%>%
            filter(label == 'Mean'),
          old_imaging$imaging_pcasl%>%
            clean_names()%>%
            rename_with(~str_remove_all(.x, '_pcasl'))%>%
            mutate(source = 'workbook_pcasl')%>%
            rename(DCDate = dc_date,
                   PIDN = pidn))%>%
  arrange(PIDN, DCDate, factor(source, levels = c('workbook_pcasl', 'new_pcasl')))%>%
  relocate(source, .after = DCDate)%>%
  select(-c(delta_t, tiv, gm, wm, label))%>%
  # NOTE(review): the relocate/rename of the metadata columns below is
  # immediately undone by the final select(), which drops them — the only
  # lasting effect is the alphabetical ordering of the remaining columns.
  relocate(PIDN, DCDate, source, scanner_id, source_id, as_lpipeline, proc_status, global_cbf_nocerebellum, sort(colnames(.)))%>%
  rename(asl_pipeline = as_lpipeline)%>%
  distinct(PIDN, DCDate, .keep_all = TRUE)%>%
  select(-c(scanner_id:asl_pipeline), -global_cbf_nocerebellum) ->
  imaging_pcasl



# DTI v2 tract metrics: old workbook only (new data not yet reprocessed).
# Metadata columns stay as-is; every other column is coerced to numeric,
# and the fornix tract label is shortened.
dti_v2 <- old_imaging$imaging_dti_v2 %>%
  rename(date = DCDate) %>%
  clean_names() %>%
  mutate(across(-c(pidn, date, scanner_id, source_id, dt_ipipeline, proc_status_dti2, dtiv2_v2_v2_v1_v1_converted), as.numeric)) %>%
  rename_with(str_replace_all, .cols = everything(), pattern = "fornix_stria_terminalis", replacement = "fornix") %>%
  mutate(source = 'old_workbook') %>%
  arrange(pidn, date) %>%
  relocate(source, .after = date) %>%
  rename(PIDN = pidn, DCDate = date) %>%
  distinct(PIDN, DCDate, .keep_all = TRUE)


# DTI v6 tract metrics: same treatment as dti_v2 (old workbook only).
dti_v6 <- old_imaging$imaging_dti_v6 %>%
  rename(date = DCDate) %>%
  clean_names() %>%
  mutate(across(-c(pidn, date, scanner_id, source_id, dt_ipipeline, proc_status_dti6), as.numeric)) %>%
  rename_with(str_replace_all, .cols = everything(), pattern = "fornix_stria_terminalis", replacement = "fornix") %>%
  mutate(source = 'old_workbook') %>%
  rename(PIDN = pidn, DCDate = date) %>%
  distinct(PIDN, DCDate, .keep_all = TRUE)
  

#t1 data can be combined. Need guidence on what to do with unmatched cols
# T1 volumes: the new Box 'gm_all_production' data, rescaled to match the
# old workbook's units, stacked with the old workbook T1 sheet.
new_imaging_gm_all_production_cleaned_renamed <- mri_box_data_modified_clean_names$imaging_gm_all_production%>%
            filter(label == 'Sum')%>%
            mutate(source = 'new_gm_all', 
                   # rescaling factors — presumably converting mm3 to the
                   # old workbook's units (1e6 for tiv/gm/wm, 1e3 for
                   # regional volumes); TODO confirm against the workbook
                   across(c(tiv, gm, wm), ~ .x / 1000000), 
                   across(left_lateral_ventricle:right_unsegmented_white_matter, ~.x / 1000))%>%
            rename(PIDN = pidn, DCDate = dc_date)%>%
            rename_with(~str_remove(., "^ctx_"))

old_imaging_t1_formatted_renamed <-  old_imaging$imaging_t1%>%
            mutate(source = 'old_workbook')%>%
            mutate(across(-c(PIDN:t1pipeline, source), ~ as.double(.x)))%>%
            rename(wm = wmv, 
                   gm = gmv)

# NOTE(review): both delt_t and delta_t are dropped below — confirm both
# spellings actually occur in the bound data (select() errors on a
# missing named column).
# NOTE(review): including factor(source, ...) in distinct() adds that
# expression as a literal column and keeps up to one row per source per
# PIDN/DCDate (both old and new rows can survive) — confirm intended,
# since the other imaging merges use arrange() + distinct(PIDN, DCDate).
bind_rows(new_imaging_gm_all_production_cleaned_renamed,
          old_imaging_t1_formatted_renamed)%>%
  relocate(source, .after = DCDate)%>%
  arrange(PIDN, DCDate)%>%
  select(-label, -delt_t, -delta_t)%>%
  distinct(PIDN, DCDate, factor(source, levels = c('old_workbook', 'new_gm_all')), .keep_all = TRUE)-> t1s_combined

# Mesoscale ----

# Mesoscale inflammation panel (2019 combined export): standardize the
# date column and move the keys to the front.
mesoscale <- read_csv(str_c(datafolder,"MSD Mesoscale Updated Dataset 2019/allMesoCombined_4_20.csv")) %>%
  rename(DCDate = meso_specDate) %>%
  mutate(DCDate = as_date(DCDate, format = '%m/%d/%Y')) %>%
  relocate(c(PIDN, DCDate), .before = mesoID)
  
# Inflammation markers: LAVA HB-MSD panel + Mesoscale team export.
# Negative LAVA values are error codes -> NA. One row per PIDN/DCDate is
# kept, preferring the row with the fewest NAs. For each analyte with a
# CV column, a *_clncv copy blanks values whose assay CV exceeds 20%
# (unreliable measurements).
bind_rows(lava$hbmsdinflammation%>%
            select(-instr_type, -v_type, -dc_status, -age_at_dc,-instr_id)%>%
            mutate(across(everything(), function(x) {replace(x, which(x < 0), NA)})),
          mesoscale%>%
            rename_with(~str_remove(.x, "\\.x$"))%>%
            select(-mesoID,-mesoBatch, -mesoBatchDate, -SITE, -ID))%>%
  arrange(PIDN,DCDate, rowSums(is.na(.)))%>%
  distinct(PIDN,DCDate,.keep_all = TRUE)%>%
  mutate(chem_eotaxin_clncv = ifelse(chem_eotaxin_cv > 0.2 & !is.na(chem_eotaxin_cv), NA, chem_eotaxin), 
         chem_eotaxin_3_clncv= ifelse(chem_eotaxin_3_cv > 0.2 & !is.na(chem_eotaxin_3_cv), NA, chem_eotaxin_3),
         cyt_ifn_gamma_clncv = ifelse(cyt_ifn_gamma_cv > 0.2 & !is.na(cyt_ifn_gamma_cv), NA, cyt_ifn_gamma),
         cyt_il_10_clncv = ifelse(cyt_il_10_cv > 0.2 & !is.na(cyt_il_10_cv), NA, cyt_il_10),
         cyt_il_12p70_clncv = ifelse(cyt_il_12p70_cv > 0.2 & !is.na(cyt_il_12p70_cv), NA, cyt_il_12p70),        
         cyt_il_13_clncv= ifelse(cyt_il_13_cv > 0.2 & !is.na(cyt_il_13_cv), NA, cyt_il_13),
         cyt_il_1beta_clncv = ifelse(cyt_il_1beta_cv > 0.2 & !is.na(cyt_il_1beta_cv), NA, cyt_il_1beta),
         cyt_il_2_clncv= ifelse(cyt_il_2_cv > 0.2 & !is.na(cyt_il_2_cv), NA, cyt_il_2),            
         cyt_il_4_clncv = ifelse(cyt_il_4_cv > 0.2 & !is.na(cyt_il_4_cv), NA, cyt_il_4),
         # BUGFIX: was checking !is.na(cyt_il_4_cv) (copy-paste error);
         # the IL-6 value must be gated on its own CV column
         cyt_il_6_clncv = ifelse(cyt_il_6_cv > 0.2 & !is.na(cyt_il_6_cv), NA, cyt_il_6),
         cyt_il_8_clncv = ifelse(cyt_il_8_cv > 0.2 & !is.na(cyt_il_8_cv), NA, cyt_il_8),             
         il_8_p_clncv = ifelse(il_8_p_cv > 0.2 & !is.na(il_8_p_cv), NA, il_8_p),               
         chem_ip_10_clncv = ifelse(chem_ip_10_cv > 0.2 & !is.na(chem_ip_10_cv), NA,chem_ip_10),          
         chem_mcp_1_clncv = ifelse(chem_mcp_1_cv > 0.2 & !is.na(chem_mcp_1_cv), NA, chem_mcp_1),          
         chem_mcp_4_clncv = ifelse(chem_mcp_4_cv > 0.2 & !is.na(chem_mcp_4_cv), NA, chem_mcp_4),          
         chem_mdc_clncv = ifelse(chem_mdc_cv > 0.2 & !is.na(chem_mdc_cv), NA, chem_mdc), 
         chem_mip_1alpha_clncv = ifelse(chem_mip_1alpha_cv > 0.2 & !is.na(chem_mip_1alpha_cv), NA, chem_mip_1alpha),
         chem_mip_1beta_clncv = ifelse(chem_mip_1beta_cv > 0.2 & !is.na(chem_mip_1beta_cv), NA, chem_mip_1beta),      
         chem_tarc_clncv = ifelse(chem_tarc_cv > 0.2 & !is.na(chem_tarc_cv), NA, chem_tarc),
         cyt_tnf_alph_clncv = ifelse(cyt_tnf_alph_cv > 0.2 & !is.na(cyt_tnf_alph_cv), NA, cyt_tnf_alph),     
         ang_bfgf_clncv = ifelse(ang_bfgf_cv > 0.2 & !is.na(ang_bfgf_cv), NA, ang_bfgf),   
         ang_flt_1_clncv = ifelse(ang_flt_1_cv > 0.2 & !is.na(ang_flt_1_cv), NA, ang_flt_1),  
         ang_plgf_clncv = ifelse(ang_plgf_cv > 0.2 & !is.na(ang_plgf_cv), NA, ang_plgf),            
         ang_tie_2_clncv = ifelse(ang_tie_2_cv > 0.2 & !is.na(ang_tie_2_cv), NA, ang_tie_2),       
         ang_vegf_clncv = ifelse(ang_vegf_cv > 0.2 & !is.na(ang_vegf_cv), NA, ang_vegf),          
         ang_vegf_c_clncv = ifelse(ang_vegf_c_cv > 0.2 & !is.na(ang_vegf_c_cv), NA, ang_vegf_c),
         ang_vegf_d_clncv = ifelse(ang_vegf_d_cv > 0.2 & !is.na(ang_vegf_d_cv), NA, ang_vegf_d), 
         vasc_crp_clncv = ifelse(vasc_crp_cv > 0.2 & !is.na(vasc_crp_cv), NA, vasc_crp),            
         vasc_icam_1_clncv = ifelse(vasc_icam_1_cv > 0.2 & !is.na(vasc_icam_1_cv), NA, vasc_icam_1), 
         vasc_saa_clncv = ifelse(vasc_saa_cv > 0.2 & !is.na(vasc_saa_cv), NA, vasc_saa),  
         vasc_vcam_1_clncv = ifelse(vasc_vcam_1_cv > 0.2 & !is.na(vasc_vcam_1_cv), NA, vasc_vcam_1))%>%
  # (removed the duplicated ang_vegf_cv / ang_vegf_clncv entries that were
  # listed twice in this selection)
  select(PIDN, DCDate, ang_bfgf, ang_bfgf_cv, ang_bfgf_clncv, ang_flt_1, ang_flt_1_cv, ang_flt_1_clncv,
         ang_plgf, ang_plgf_cv, ang_plgf_clncv, ang_tie_2, ang_tie_2_cv, ang_tie_2_clncv,
         ang_vegf, ang_vegf_cv, ang_vegf_clncv, ang_vegf_c, ang_vegf_c_cv, ang_vegf_c_clncv,
         ang_vegf_d, ang_vegf_d_cv, ang_vegf_d_clncv, chem_eotaxin, chem_eotaxin_cv, chem_eotaxin_clncv,
         chem_eotaxin_3, chem_eotaxin_3_cv, chem_eotaxin_3_clncv, chem_ip_10, chem_ip_10_cv, chem_ip_10_clncv,
         chem_mcp_1, chem_mcp_1_cv, chem_mcp_1_clncv, chem_mcp_4, chem_mcp_4_cv, chem_mcp_4_clncv,
         chem_mdc, chem_mdc_cv, chem_mdc_clncv, chem_mip_1alpha, chem_mip_1alpha_cv, chem_mip_1alpha_clncv,
         chem_mip_1beta, chem_mip_1beta_cv, chem_mip_1beta_clncv, chem_tarc, chem_tarc_cv, chem_tarc_clncv,
         cyt_ifn_gamma, cyt_ifn_gamma_cv, cyt_ifn_gamma_clncv, cyt_il_10, cyt_il_10_cv, cyt_il_10_clncv,
         cyt_il_12p70, cyt_il_12p70_cv, cyt_il_12p70_clncv, cyt_il_13, cyt_il_13_cv, cyt_il_13_clncv, cyt_il_1beta,
         cyt_il_1beta_cv, cyt_il_1beta_clncv, cyt_il_2, cyt_il_2_cv, cyt_il_2_clncv, cyt_il_4, cyt_il_4_cv,
         cyt_il_4_clncv, cyt_il_6, cyt_il_6_cv, cyt_il_6_clncv, cyt_il_8, cyt_il_8_cv,
         cyt_il_8_clncv, cyt_tnf_alph, cyt_tnf_alph_cv, cyt_tnf_alph_clncv, il_8_p,
         il_8_p_cv, il_8_p_clncv, mesoPlate, pro_gm_csf, pro_il_1,
         pro_il_12p40, pro_il_15, pro_il_16, pro_il_17, pro_il_5, pro_il_7,
         pro_tnf, pro_vegf, vasc_crp, vasc_crp_cv, vasc_crp_clncv, vasc_icam_1,
         vasc_icam_1_cv, vasc_icam_1_clncv, vasc_saa, vasc_saa_cv, vasc_saa_clncv, vasc_vcam_1,
         vasc_vcam_1_cv, vasc_vcam_1_clncv) ->
  mesoscale

# Quanterix ----

# Quanterix Simoa plasma panel (Feb 2022 export): coerce assay columns to
# numeric, keep the core analytes + CVs, and standardize key names.
quanterix <- read_excel(str_c(datafolder,"Quanterix/plasma quanterix data_021622.xlsx")) %>%
  relocate(c(PIDN, `Sample Date`), .before = `HB Unqid`) %>%
  mutate(across(c(`plasma ttau`:last_col()), as.numeric)) %>%
  select(PIDN, `Sample Date`, `plasma ttau`, `plasma ttau_cv`, `plasma ptau181`,
         `plasma ptau181_cv`, `plasma ab40`, `plasma ab40_cv`, `plasma ab42`,
         `plasma ab42_cv`, `plasma gfap`, `plasma gfap_cv`, `plasma nfl`, `plasma nfl_cv`) %>%
  clean_names() %>%
  rename(PIDN = pidn, DCDate = sample_date) %>%
  mutate(DCDate = as_date(DCDate))

# Fitbit sub-study pTau181 run (Apr 2023 export), cleaned below.
quanterix_fitbit <- read_excel(str_c(datafolder,'Quanterix/Casaletto_Plasma_Simoa_Quanterix_pTau181_04052023.xlsx'))


# Clean the Fitbit pTau181 export: snake_case names, Date columns, and
# numeric coercion for the id/assay fields.
quanterix_fitbit <- quanterix_fitbit %>%
  clean_names() %>%
  rename(PIDN = pidn, DCDate = sample_date) %>%
  mutate(DCDate = as_date(DCDate),
         analysis_date = as_date(analysis_date),
         across(c(PIDN, sample_id, kit_lot_number, results, sd, cv_percent, run_id, cv), as.numeric)) %>%
  select(PIDN, DCDate, analyte, specimen_type, sample_origin, sample_id, visit_number, kit_lot_number, results, units, sd, cv_percent, run_id, cv, protocol)

# Genetics ----

# Gather all genetics files (skipping Word docs and Office lock files
# beginning with '~$').
list.files(str_c(datafolder,'genetics'), recursive = TRUE, all.files = FALSE, full.names = TRUE)%>%
  str_subset( '[~$]|\\.docx', negate = TRUE) -> genetics_files

genetics_data <- read_all_files(genetics_files, prefix = 'genetics', skip = 0, clean_names = FALSE)
# some columns were repeated. using coalesce to join them together on the same name. 
genetics_data$genetics_master_sequenom_merged_sequenom_allmerged_112922%>%
# remove duplicated-name suffix ('...1') so repeated columns share a name,
# then group the columns by that base name
split.default(str_remove(names(.), "\\...+")) %>%
# join columns together and add data if data is missing in first instance of column
map_dfc(~ exec(coalesce, !!!.x))%>%
# rename column that wasn't suffixed 
rename(CPEB3_rs11186856 = CPEB3)%>%
# convert month names to number and reorder to (e.g. jan-3 -> 3/1)-- undoing excels auto date detection
mutate(across(everything(),
              ~ifelse(str_detect(str_remove_all(.x,'[^[[:alpha:]]]'), paste(month.abb, collapse = "|")),
                      paste(match(str_remove_all(.x,'[^[[:alpha:]]]'),month.abb), parse_number(.x), sep = "/"), .x)),
       #convert met to a and val to g 
       COMT_rs4680 = str_replace_all(COMT_rs4680, 'Met', 'A'), 
       COMT_rs4680 = str_replace_all(COMT_rs4680, 'Val', 'G')) ->
  genetics_data$genetics_master_sequenom_merged_sequenom_allmerged_112922 


#joining omni data together and pivoting so colnames match seq data created the letter suffixes to reorder columns alphabetically-not keeping those columns now so is redundant but keep in case we want to reinclude at any point
# Omni genotype sheets: pivot both sheets wide (one column set per gene,
# keyed by the sequenom merge name), join on PIDN, convert '.'/'./.'
# placeholders to NA, and repair Excel's month/date mangling. The
# '__a__'..'__e__' prefixes exist only to control alphabetical column
# ordering downstream.
full_join(
  genetics_data$genetics_genotypes_hillblom2019_genotypes%>%
    mutate(a1a2 = paste(A1,'/',A2,sep = ''), 
           `__b__alt_ref` = paste(Alt, '/', Ref, sep =''))%>%
    pivot_wider(id_cols = 'PIDN',
                names_from = c(`Variable Name for Sequenom Merge`),
                names_glue = '{`Variable Name for Sequenom Merge`}_{.value}',
                values_from = c(a1a2, `GC Score`, Notes, Chr, Start, End, `__b__alt_ref`)),
  genetics_data$genetics_genotypes_hillblom2019_more_genotypes%>%
    mutate(a1a2 = paste(A1,'/',A2,sep = ''),
           `__b__alt_ref` = paste(Alt, '/', Ref, sep =''))%>%
    pivot_wider(id_cols = 'PIDN',
                names_from = c(`Variable Name for Sequenom Merge`),
                names_glue = '{`Variable Name for Sequenom Merge`}_{.value}',
                values_from = c(a1a2, `GC Score`, Notes, Chr, Start, End, `__b__alt_ref`)), by = 'PIDN')%>%
  mutate(across(everything(), ~na_if(., ".")),
         across(everything(), ~na_if(., "./.")))%>%
  rename_with(~ str_replace(.x, 'GC Score', '__a__gc_score'), ends_with('GC Score'))%>%
  rename_with(~ str_replace(.x, 'Chr', '__c__chr'), ends_with('Chr'))%>%
  rename_with(~ str_replace(.x, 'Start', '__d__start'), ends_with('Start'))%>%
  rename_with(~ str_replace(.x, 'End', '__e__end'), ends_with('End'))%>%
  # tag every non-genotype column with '_omni'; the genotype columns keep
  # the bare gene name once '_a1a2' is stripped below
  rename_with(~str_replace(.x, .x, str_c(.x,'_omni')), !matches('_a1a2|PIDN'))%>%
  rename_with(~str_remove(.x,'_a1a2'))%>%
  remove_empty(which = c('rows','cols'))%>%
  # reformat any data that was converted to dates in excel 
  mutate(across(everything(),
                ~ifelse(str_detect(str_remove_all(.x,'[^[[:alpha:]]]'), paste(month.abb, collapse = "|")),
                        paste(match(str_remove_all(.x,'[^[[:alpha:]]]'),month.abb), parse_number(.x), sep = "/"), .x)),
         #convert met to a and val to g -- prob not necessary but want to do before join to sequneom data
         COMT_rs4680 = str_replace_all(COMT_rs4680, 'Met', 'A'), 
         COMT_rs4680 = str_replace_all(COMT_rs4680, 'Val', 'G')) ->
  omni_data 


# this joins based on overlap and PIDN, if no match, arranges to prioritize sequenom data, and, selects only columns that contain overlap name, drops empty gene data, and selects only one row-- this is to ensure we have all data that is available for each gene and that sequenom data isnt paired with mismatching omni metadata
# cleaning sequenom data before joining

# find columns that are present in both omni data and sequenom data
# BUGFIX: the sheet name previously ended in '_11292' (missing the final
# '2') and only resolved via `$` partial matching on the list; spell the
# element name out exactly.
overlapping_genes <- intersect(names(omni_data%>%select(-PIDN)),
                               names(genetics_data$genetics_master_sequenom_merged_sequenom_allmerged_112922))
#find only genes in sequenom data
sequenom_only <- setdiff(names(genetics_data$genetics_master_sequenom_merged_sequenom_allmerged_112922),
                         names(omni_data%>%select(-PIDN)))

#create list of dfs from omni data - one for each gene and the data there within
overlapping_genes_dfs <- lapply(overlapping_genes,
                                function(x) omni_data%>%
                                  select(PIDN, contains(x)))%>%
  set_names(overlapping_genes)

#join list of omni gene dfs to sequenom data 
# For each overlapping gene: full-join the sequenom sheet to that gene's
# omni columns, prefer the row with the fewest NAs per PIDN, and record a
# per-gene source label ('sequenom', 'omni', or 'sequenom + omni'). When
# both sources exist but the omni gc_score is missing, the sequenom call
# is treated as authoritative.
# FIX: wrap external character selections in all_of() — bare `gene_name`
# in drop_na()/select() relied on deprecated tidyselect behavior.
list_of_joined_genetics_dfs<-map2(overlapping_genes_dfs, overlapping_genes, function(gene_df, gene_name) {
    full_join(genetics_data$genetics_master_sequenom_merged_sequenom_allmerged_112922%>%
                  #indicate where data came from 
                  mutate(sequenom = 'sequenom'),
              gene_df%>%mutate(omni = 'omni'))%>%
        arrange(PIDN, rowSums(is.na(.)))%>%
        select(PIDN, contains(gene_name), sequenom, omni)%>%
        group_by(PIDN)%>%
        drop_na(all_of(gene_name))%>%
      # create source column and concat strings from sequenom and omni columns unless theres an empty gc_score column where both omni and seq data exist
        mutate(source = ifelse(str_c(gene_name,'___a__gc_score_omni') %in% colnames(.) &&
                                   is.na(get(str_c(gene_name,'___a__gc_score_omni'))) &&
                                   any(!is.na(sequenom)) &&
                                   any(!is.na(omni)),
                               'sequenom',
                               str_c(sequenom[!is.na(sequenom)], omni[!is.na(omni)], sep = ' + ')))%>%
        select(PIDN, source, all_of(gene_name), contains('gc_score'))%>%
        rename_with(~str_replace(., 'source', str_c(gene_name, '_source')))%>%
        rename_with(~str_remove_all(.x, '__[:alnum:]__'), matches('_[[:alnum:]]_'))%>%
        rename_with(~str_remove_all(.x, '_omni'))%>%
        distinct(PIDN, .keep_all = TRUE)}) 

#final genetic data to use 
# Coalesce the per-gene joined frames into one wide table, then pull in
# the sequenom-only genes (each tagged with a '<gene>_source' =
# 'sequenom' column). gc_score columns drive the coalescing priority
# upstream and are dropped from the final output.
list_of_joined_genetics_dfs%>%
  reduce(., coalesce_join, by = 'PIDN')%>%
  # BUGFIX: sheet name previously ended '_11292' (missing the final '2')
  # and only resolved via `$` partial matching; use the exact name.
  # FIX: all_of() makes the external character vector selection explicit
  # (bare `sequenom_only` in across() is deprecated tidyselect usage).
  coalesce_join(genetics_data$genetics_master_sequenom_merged_sequenom_allmerged_112922%>%
                  #create source column for those columns not in omni data
                  mutate(across(all_of(sequenom_only), .names = '{.col}_source', ~ paste('sequenom'))), by = 'PIDN')%>%
  relocate(sort(names(.)))%>%
  relocate(PIDN, matches(match = 'apoe', ignore.case = TRUE),
           matches(match = 'bdnf', ignore.case = TRUE),
           matches(match = 'comt', ignore.case = TRUE),
           matches(match = 'DRD4', ignore.case = TRUE),
           matches(match = 'SNAP25', ignore.case = TRUE))%>%
  mutate(across(contains('gc_score'), ~as.numeric(.)))%>%
  # removing gc_score columns from genetics data
  select(-ends_with('gc_score'))->
  genetics_data

# Gait ----

# Gait + SPPB workbook: standardize keys, coerce the gait measures to
# numeric, and keep only the columns used downstream.
gait <- read_excel(str_c(datafolder,'gait/Gait + SPPB.xlsx')) %>%
  clean_names() %>%
  rename(DCDate = neuroexam_date, PIDN = pidn) %>%
  mutate(DCDate = as_date(DCDate),
         across(c(gait_version, gait_distance, leisure_t1, leisure_t2, leisure_t3, gait_leisure_avg, speed_t1, speed_t2, speed_t3, gait_speed_avg),
                as.numeric)) %>%
  relocate(DCDate, .after = PIDN) %>%
  select(PIDN, DCDate, project, gait_version, gait_distance, leisure_t1, leisure_t2, leisure_t3, gait_leisure_avg, speed_t1, speed_t2, speed_t3, gait_speed_avg)

# External Team Data ----

# Image ----

# PET Data ----

pet_data <- read_excel(str_c(datafolder,"pet/Hillblom_AmyloidPET_Final_11-3-22_CY.xlsx"))
# Only the most recent PET core export is used (lava/previous-dataset PET
# data cannot be validated). Positivity thresholds provided by Renaud on
# 2022-12-09:
# AV45: 1.11 SUVR (Landau et al, JNM 2013; https://pubmed.ncbi.nlm.nih.gov/23166389/)
# PIB: 1.21 SUVR (Villeneuve et al Brain 2015; https://pubmed.ncbi.nlm.nih.gov/25953778/)
# FBB: 1.08 SUVR (Royse et al, ART 2021 https://pubmed.ncbi.nlm.nih.gov/33971965/)
pet <- pet_data %>%
  rename(DCDate = PETDate) %>%
  mutate(PETsuvr = as.numeric(PETsuvr),
         PETcentiloids = as.numeric(PETcentiloids),
         DCDate = as_date(DCDate),
         # per-tracer threshold; unknown tracers get NA
         PET_suvr_threshold = case_when(PETcompound == 'AV45' ~ 1.11,
                                        PETcompound == 'PIB' ~ 1.21,
                                        PETcompound == 'FBB' ~ 1.08),
         PET_suvr_threshold_positive = as.numeric(PETsuvr > PET_suvr_threshold))

# Sleep Team Data ----

# Sleep team exports.
# BUGFIX: the prefix must be a string, as in the genetics read_all_files
# call — the bare symbol `sleep` previously evaluated to the base R
# `datasets::sleep` data frame rather than the intended prefix.
sleep_data <- read_all_files(str_c(datafolder,"sleep/sleep_team_data.xlsx"), prefix = 'sleep', skip = 0, clean_names = TRUE)

sleep_profiler_data <- read_excel(str_c(datafolder,"sleep/sleep_study_metrics_April_2023.xlsx"))

# Overlapping sleep instruments ----

# Standardize the sleep team questionnaire export: rename to the naming
# convention used elsewhere, derive the ISI severity category and the
# binary PSQI cutoff, and drop near-empty rows.
# (The original self-renames psqi_slpqual/psqi_meds were no-ops and are
# omitted.)
sleep_data$sleep_questionnaire_data_sleep_team_data <- sleep_data$sleep_questionnaire_data_sleep_team_data %>%
  rename(DCDate = sleep_study_date,
         Berlin_apneaRisk = berlin_risk,
         ISItotal = isi,
         psqi_Durat = final_sleep_duration_score,
         psqi_Disturb = final_sleep_disturbance_score,
         psqi_Laten = final_sleep_latency_score,
         psqi_daydys = final_daytime_dysfunction_score,
         psqi_hse = final_hse_score,
         psqi_PSQItot = global_psqi_score,
         ESStotal = ess_total) %>%
  mutate(DCDate = as_date(DCDate),
         # ISI severity bands: 0-7 / 8-14 / 15-21 / 22-28
         ISIdx = case_when(ISItotal <= 7 ~ "no clinical insomnia",
                           ISItotal >= 8 & ISItotal <= 14 ~ "subthreshold insomnia",
                           ISItotal >= 15 & ISItotal <= 21 ~ "clinical insomnia (moderate)",
                           ISItotal >= 22 & ISItotal <= 28 ~ "clinical insomnia"),
         psqi_totBinary = as.numeric(psqi_PSQItot >= 6)) %>%
  #filtering out data with less than 75% of data (NAs or all 0s). see as_date('2022-03-12') & PIDN == 23898
  filter(rowSums(is.na(.) | . == 0) < .75 * ncol(.))

# Sleep Instruments ----

#  - These are not in LAVA or Qualtrics data
# Sleep-team-only instruments (CTI, SPS, FOSQ, MEQ): keep just the keys
# and the instrument scores.
sleep_instruments <- sleep_data$sleep_questionnaire_data_sleep_team_data %>%
  select(PIDN, DCDate, bmi, cti_lv, cti_fr, sps_cbc, sps_ac, sps_total,
         fosq_general_productivity, fosq_social_outcome, fosq_activity_level,
         fosq_vigilance, fosq_intimate_rels_sexual_activity, fosq_total, meq_morn_total, meq_score)

# Sleep Profiler ----

# not using the averages at the moment-these data are also outdated and have been replaced with updated sleep profiler data from sleep team. Can use the code to convert new data to averages if requested

# sleep_data$sleep_sleep_profiler_data_sleep_team_data <- sleep_data$sleep_sleep_profiler_data_sleep_team_data%>%
#     group_by(PIDN)%>%
#     fill(sleep_study_date, .direction = 'down')%>%
#     rename(DCDate = sleep_study_date)%>%
#     group_by(PIDN, DCDate)%>%
#     mutate(DCDate = as_date(DCDate))%>%
#     mutate(across(c(sleep_latency_min, total_waso_min, actual_sleep_time_min, sleep_efficiency_ast_time_in_bed, n1_mins, n2_mins, n3_mins, rem_mins),
#                   ~mean(.x),.names = '{.col}_avg'),
#            across(c(sleep_latency_min, total_waso_min, actual_sleep_time_min, sleep_efficiency_ast_time_in_bed, n1_mins, n2_mins, n3_mins, rem_mins),
#                   ~sd(.x),.names = '{.col}_sd'),
#            across(c(sleep_latency_min, total_waso_min, actual_sleep_time_min, sleep_efficiency_ast_time_in_bed, n1_mins, n2_mins, n3_mins, rem_mins),
#                   ~max(.x),.names = '{.col}_max'),
#            across(c(sleep_latency_min, total_waso_min, actual_sleep_time_min, sleep_efficiency_ast_time_in_bed, n1_mins, n2_mins, n3_mins, rem_mins),
#                   ~min(.x),.names = '{.col}_min'),
#            num_nights = max(night))%>%
#   select(PIDN, DCDate, num_nights, ends_with('avg'), ends_with('min'), ends_with('max'), ends_with('sd'))%>%
#   relocate(PIDN, DCDate, num_nights, sort(names(.)))%>%
#   distinct(PIDN, DCDate, .keep_all = TRUE)%>%
#   select(-actual_sleep_time_min, -total_waso_min, -sleep_latency_min) ->
#   sleep_profiler


# Updated sleep profiler export: one row per recorded night. Nights are
# grouped into recording sessions (a drop in the Night counter marks a
# new session), then pivoted wide to one row per PIDN/session with
# per-night columns (<var>_night_<n>).
sleep_profiler_data%>%
    rename(DCDate = StudyDate)%>%
    mutate(DCDate = as_date(DCDate))%>%
    arrange(PIDN, DCDate) %>%
    # NOTE(review): grp is computed over the whole arranged table, but
    # combining it with PIDN in group_by() keeps participants separate —
    # confirm Night always restarts (or decreases) at a session boundary.
    group_by(PIDN, grp = cumsum(c(TRUE, diff(Night) < 0)))%>%
    mutate(startdate = min(DCDate), enddate = max(DCDate))%>%
    relocate(startdate, enddate, .after = DCDate)%>%
    pivot_wider(id_cols = c('PIDN', 'startdate', 'enddate'), 
                values_from = -c('PIDN', 'DCDate', 'Sleep_Profiler_Device_SN', 'startdate', 'enddate', 'Night', 'grp'),
                names_from = 'Night',
                names_glue = '{.value}_night_{Night}') -> 
  sleep_profiler

# Sleep Apnea ----

# ApneaLink recordings: standardize the date key.
apnea <- sleep_data$sleep_apnea_data_sleep_team_data %>%
  rename(DCDate = apnealink_recordingdate) %>%
  mutate(DCDate = as_date(DCDate))

# Epworth Sleepiness ----

#  - Combining sleep team data and other source data
# Epworth Sleepiness Scale: combine the Qualtrics export with the sleep
# team's totals, keep one row per PIDN/DCDate (qualtrics > sleep_team >
# prev_dataset), and shorten the verbose Qualtrics item names.
qualtrics_data$qualtrics_epworth_sleepiness_scale%>%
  bind_rows(sleep_data$sleep_questionnaire_data_sleep_team_data%>%
              select(PIDN, DCDate, ESStotal)%>%
              mutate(source = 'sleep_team')%>%
              filter(!is.na(ESStotal)))%>%
  group_by(PIDN, DCDate)%>%
  arrange(PIDN, DCDate, factor(source, levels = c('qualtrics', 'sleep_team', 'prev_dataset')))%>%
  distinct(PIDN, DCDate, .keep_all = TRUE)%>%
  rename(age = what_is_your_age_in_years,
      sitting_and_reading = how_likely_are_you_to_doze_off_or_fall_asleep_in_the_following_situations_in_contrast_to_feeling_just_tired_sitting_and_reading,
      watching_tv = how_likely_are_you_to_doze_off_or_fall_asleep_in_the_following_situations_in_contrast_to_feeling_just_tired_watching_tv, 
      inactive_in_a_public_place = 
        how_likely_are_you_to_doze_off_or_fall_asleep_in_the_following_situations_in_contrast_to_feeling_just_tired_sitting_inactive_in_a_public_place_e_g_a_theater_or_a_meeting, 
      passenger_in_car = 
        how_likely_are_you_to_doze_off_or_fall_asleep_in_the_following_situations_in_contrast_to_feeling_just_tired_as_a_passenger_in_a_car_for_an_hour_without_a_break, 
      rest_in_afternoon = 
        how_likely_are_you_to_doze_off_or_fall_asleep_in_the_following_situations_in_contrast_to_feeling_just_tired_lying_down_to_rest_in_the_afternoon_when_circumstances_permit,
      sitting_and_talking = how_likely_are_you_to_doze_off_or_fall_asleep_in_the_following_situations_in_contrast_to_feeling_just_tired_sitting_and_talking_to_someone, 
      after_lunch = how_likely_are_you_to_doze_off_or_fall_asleep_in_the_following_situations_in_contrast_to_feeling_just_tired_sitting_quietly_after_a_lunch_without_alcohol,
      in_car_while_stopped = 
        how_likely_are_you_to_doze_off_or_fall_asleep_in_the_following_situations_in_contrast_to_feeling_just_tired_in_a_car_while_stopped_for_a_few_minutes_in_the_traffic,
      sum_high_chance_of_dozing = high, 
      sum_mod_chance_of_dozing = moderate, 
      sum_slight_chance_of_dozing = slight, 
      sum_would_never_doze = no)%>%
  select(-what_is_your_sex, -age)%>%
  # NOTE(review): prioritize_qualtrics_data() is a project helper —
  # presumably resolves remaining cross-source duplicates; the frame is
  # still grouped by PIDN/DCDate when it is called.
  prioritize_qualtrics_data() ->
  epworth_sleepiness 

# Insomnia Severity Index (ISI) ----

#  - Combining sleep team data and other source data
# ISI: Qualtrics + sleep team totals, one row per PIDN/DCDate
# (qualtrics preferred over sleep_team over prev_dataset).
isi_sleep_team <- sleep_data$sleep_questionnaire_data_sleep_team_data %>%
  select(PIDN, DCDate, ISIdx, ISItotal) %>%
  filter(!is.na(ISItotal)) %>%
  mutate(source = 'sleep_team')

insomnia_severity_index <- qualtrics_data$qualtrics_insomnia_severity_index_isi %>%
  bind_rows(isi_sleep_team) %>%
  group_by(PIDN, DCDate) %>%
  arrange(PIDN, DCDate, factor(source, levels = c('qualtrics','sleep_team', 'prev_dataset'))) %>%
  distinct(PIDN, DCDate, .keep_all = TRUE) %>%
  prioritize_qualtrics_data()

# Pittsburgh Sleep Quality Index (PSQI) ----

#  - Combining sleep team data and other source data
# PSQI: Qualtrics totals + sleep team totals, one row per PIDN/DCDate
# (qualtrics preferred over sleep_team over prev_dataset).
psqi_sleep_team <- sleep_data$sleep_questionnaire_data_sleep_team_data %>%
  select(PIDN, DCDate, psqi_PSQItot, psqi_totBinary) %>%
  mutate(source = 'sleep_team',
         instr_type = 'PSQI') %>%
  filter(!is.na(psqi_PSQItot))

psqi <- qualtrics_data$qualtrics_pittsburgh_sleep_quality_index %>%
  mutate(instr_type = 'PSQI') %>%
  select(PIDN, DCDate, source, instr_type, psqi_PSQItot, psqi_totBinary) %>%
  bind_rows(psqi_sleep_team) %>%
  group_by(PIDN, DCDate) %>%
  arrange(PIDN, DCDate, factor(source, levels = c('qualtrics', 'sleep_team', 'prev_dataset'))) %>%
  distinct(PIDN, DCDate, .keep_all = TRUE) %>%
  prioritize_qualtrics_data()

# Berlin Sleep ----

#  - Combining sleep team data and other source data
# Berlin: Qualtrics + sleep team apnea-risk ratings; risk labels are
# harmonized to lower case before deduplicating.
berlin_sleep_team <- sleep_data$sleep_questionnaire_data_sleep_team_data %>%
  select(PIDN, DCDate, Berlin_apneaRisk) %>%
  mutate(source = 'sleep_team') %>%
  filter(!is.na(Berlin_apneaRisk))

berlin_sleep <- qualtrics_data$qualtrics_berlin_sleep_questionnaire %>%
  bind_rows(berlin_sleep_team) %>%
  group_by(PIDN, DCDate) %>%
  arrange(PIDN, DCDate, factor(source, levels = c('qualtrics', 'sleep_team', 'prev_dataset'))) %>%
  mutate(Berlin_apneaRisk = case_when(Berlin_apneaRisk == 'HighRisk' ~ 'high risk',
                                      Berlin_apneaRisk == 'LowRisk' ~ 'low risk',
                                      TRUE ~ tolower(Berlin_apneaRisk))) %>%
  distinct(PIDN, DCDate, .keep_all = TRUE) %>%
  prioritize_qualtrics_data()

# Previous Dataset Data ----

#  - Data that is only available in the previous dataset

# Synaptic Markers ----

#  - original source not available
# Synaptic CSF markers are only available from the previous compiled
# dataset; keep rows with a draw date and a Zetterberg spec ID.
old_dataset <- read_csv(str_c(datafolder,'old_dataset/HBSpring2020_2020-06-18.T1.WMH.v2.v6.SNI.Repro.moregene.meso.champscomp.newPET.newFitbit.PRSupdated.csv'))

csf <- old_dataset %>%
  select(PIDN, csfdrawDate, csfSpecID:GAP43..156.10000.pg.ml..CSF) %>%
  mutate(csfdrawDate = as_date(csfdrawDate, format = '%m/%d/%Y')) %>%
  rename(DCDate = csfdrawDate) %>%
  filter(!is.na(DCDate) & !is.na(csfzetterbergSpecID)) %>%
  select(PIDN, DCDate, csfSpecID, AB1.40..pg.mL., AB1.42..pg.mL.,
         pTau...pg.mL., tTau...pg.mL., NG36..pg.mL., SYT1..pM.,
         SNAP25long..pM., SNAP25tot..pM., GAP43..156.10000.pg.ml..CSF)

Bedside Composites

Joining bedside data

  • Joining together bedsidescreen, bedside alternates, bedside with old vars, neuropsych bedside, neuropsych cvlt, and demographics
# Duplicate each source's instr_id under a source-specific name so the ids
# survive the multi-source bind below; .after places the copy next to the key.
lava$bedsidealternates <- lava$bedsidealternates %>%
  mutate(instr_id_alternates = instr_id, .after = PIDN)
lava$bedsidescreen_witholdvars <- lava$bedsidescreen_witholdvars %>%
  mutate(instr_id_old_vars = instr_id, .after = PIDN)
lava$neuropsychbedside <- lava$neuropsychbedside %>%
  mutate(instr_id_np_bedside = instr_id_11, .after = DCDate)
lava$neuropsychcvlt <- lava$neuropsychcvlt %>%
  mutate(instr_id_np_cvlt_bedside = instr_id, .after = PIDN)


# keeps only the vars hillblom wants, joins all sources from lava, and creates
# wrat_baseline and wrat_baseline_date vars. Replaces negative values with NAs.

# The join chain + slice(0) builds an EMPTY frame holding the union of all
# source columns, so bind_rows aligns every source into one common schema.
lava$bedsidescreen%>%
    left_join(lava$bedsidealternates)%>%
    left_join(lava$bedsidescreen_witholdvars)%>%
    left_join(lava$neuropsychbedside)%>%
    left_join(lava$neuropsychcvlt)%>%
    slice(0)%>%
    bind_rows(lava$neuropsychbedside%>%
                  mutate(source='np_bs'),
              lava$bedsidescreen%>%
                  mutate(source='bs'),
              lava$bedsidealternates%>%
                  mutate(source = 'bsa'),
              lava$neuropsychcvlt%>%
                  mutate(source = 'np_cvlt'),
              lava$bedsidescreen_witholdvars%>%
                  mutate(source='bsov'))%>%
    group_by(PIDN, DCDate)%>%
  # sets instr_id as instr_id_11 or instr_id_141 if instr_id is empty.
  # BUGFIX: the original nested ifelse placed the instr_id_141 fallback inside
  # the FALSE branch of is.na(instr_id), so it was unreachable; coalesce()
  # implements the stated intent. (Assumes the three id columns share a type —
  # coalesce() errors rather than silently coercing if they do not.)
    mutate(instr_id = coalesce(instr_id, instr_id_11, instr_id_141))%>%
  # negative values are LAVA missing-data codes; convert them to NA
    mutate(across(1:instr_id_np_cvlt_bedside, function(x) {replace(x, which(x<0), NA)}))%>%
    select(PIDN:instr_id, source, mmse_tot:mod_rey, digit_fw, digit_bw, wrat_tot, d_corr:rey_recg, bnt_corr:numb_loc, gds1:gds15to,
           instr_id_alternates, instr_type_2, cv2form:cv2rd, mod_rey_b:rey_b_recg)%>%
  # arranges by row with most data, fills missing values upward within each
  # PIDN/DCDate group so the kept row carries the merged values
    arrange(PIDN, DCDate, rowSums(is.na(.)))%>%
    fill(everything(), .direction = "up")%>%
  # creates cols to identify where data came from if filled from other rows
    mutate(other_instr_ids = str_c(unique(c(paste(instr_id))), collapse = ", "))%>%
    mutate(other_instr_types = str_c(unique(c(instr_type)), collapse = ", "))%>%
    relocate(source, other_instr_ids, other_instr_types, .after = instr_id)%>%
  # baseline WRAT = first valid (non-NA, non-negative) WRAT per participant,
  # joined back by PIDN so every visit carries it
    left_join(lava$bedsidescreen%>%
                  select(PIDN,DCDate,wrat_tot)%>%
                  group_by(PIDN)%>%
                  filter(!is.na(wrat_tot) & wrat_tot >=0)%>%
                  arrange(PIDN,DCDate)%>%slice(1)%>%
                  rename(wrat_baseline_date = DCDate, wrat_baseline = wrat_tot), by = 'PIDN')%>%
    mutate(wrat_on_or_before_current_date = wrat_baseline_date <= DCDate)%>%
    relocate(wrat_baseline_date, wrat_baseline, wrat_on_or_before_current_date, .before = wrat_tot)%>%
    distinct(PIDN,DCDate, .keep_all = TRUE)%>%
  select(-age_at_dc) ->
  bedside

Composite calculations

  • Taken from existing code
## Modified Trails: correct-per-minute rate and its log transform
bedside <- bedside %>%
  mutate(mt_ratio = (60 * mt_corr) / mt_time,
         mt_ln = log(mt_ratio + 1))

## CVLT Long: recognition hit/false-alarm rates, z-transforms, d-prime, bias.
## Extreme proportions (0 or max) are nudged by half an item so qnorm is finite.
bedside <- bedside %>%
  mutate(
    cv2fptot = cv2b_r + cv2b_u + cv2np + cv2nu,
    # error found here previously: the cv2pfp-style statement originally used
    # cv2fptot in place of cv2hit, creating differences vs the prior dataset
    cv2phit = ifelse(cv2hit == 16, 15.5/16,
                     ifelse(cv2hit == 0, 0.5/16, cv2hit/16)),
    cv2pfp = ifelse(cv2fptot == 32, 31.5/32,
                    ifelse(cv2fptot == 0, 0.5/32, cv2fptot/32)),
    cv2zhit = qnorm(cv2phit),
    cv2zfp = qnorm(cv2pfp),
    cv2dprime = cv2zhit - cv2zfp,
    cv2bias = -0.5 * (cv2zhit + cv2zfp))

## 1 Back
lava$`1back`$nb1Hits <- ifelse(lava$`1back`$nb1c > 5 & lava$`1back`$nb1smc > 0 ,
                                         (lava$`1back`$nb1smc+0.5)/11,NA)
lava$`1back`$nb1TotalNo <- lava$`1back`$nb1s1c + lava$`1back`$nb1s2c + lava$`1back`$nb1s3c + lava$`1back`$nb1s4c
lava$`1back`$nb1FalseAlarms<-(20 - lava$`1back`$nb1TotalNo + 0.5) / 21
lava$`1back`$nb1ZFA <- qnorm(lava$`1back`$nb1FalseAlarms)
lava$`1back`$nb1ZHIT <- qnorm(lava$`1back`$nb1Hits)
lava$`1back`$nb1dprime <- lava$`1back`$nb1ZHIT-lava$`1back`$nb1ZFA

## 2 Back
lava$`2back`$nb2Hits <- ifelse(lava$`2back`$nb2c > 10 & lava$`2back`$nb2smc > 0 ,
                                (lava$`2back`$nb2smc + 0.5)/31,NA)
lava$`2back`$nb2TotalNo <- lava$`2back`$nb2s1c + lava$`2back`$nb2s2c + lava$`2back`$nb2s3c + lava$`2back`$nb2s4c
lava$`2back`$nb2FalseAlarms<-(60 - lava$`2back`$nb2TotalNo + 0.5) / 61
lava$`2back`$nb2ZFA <- qnorm(lava$`2back`$nb2FalseAlarms)
lava$`2back`$nb2ZHIT <- qnorm(lava$`2back`$nb2Hits)
lava$`2back`$nb2dprime <- lava$`2back`$nb2ZHIT-lava$`2back`$nb2ZFA
lava$`2back`$nb2bias <- (lava$`2back`$nb2ZHIT + lava$`2back`$nb2ZFA)/2

## Flanker eprime
lava$enclosedflanker$flkincacc <- lava$enclosedflanker$ef_inc_cnt/40
lava$enclosedflanker$flkincaccscore <- 5*lava$enclosedflanker$flkincacc
lava$enclosedflanker$flkinclog <- log10(lava$enclosedflanker$ef_inc_med)
lava$enclosedflanker$flkincstem <- (lava$enclosedflanker$flkinclog-log10(400))/(log10(800)-log10(400))
lava$enclosedflanker$flkincrtscore <- 5-(5*lava$enclosedflanker$flkincstem)
lava$enclosedflanker$flankerinc <- ifelse(lava$enclosedflanker$ef_inc_cnt > 10,lava$enclosedflanker$flkincaccscore+lava$enclosedflanker$flkincrtscore,NA)

## SetShifting eprime
lava$setshifting$shiftacc <- lava$setshifting$all_shft_c_shift_corr / (lava$setshifting$all_shft_c_shift_corr + lava$setshifting$all_shft_e_shift_errors)
lava$setshifting$shiftaccscore <- 5*lava$setshifting$shiftacc
lava$setshifting$all_shft_m_shift_median[lava$setshifting$all_shft_m_shift_median<400] <- 400
lava$setshifting$all_shft_m_shift_median[lava$setshifting$all_shft_m_shift_median>2800] <- 2800
lava$setshifting$shftlog <- log10(lava$setshifting$all_shft_m_shift_median)
lava$setshifting$shftstem <- (lava$setshifting$shftlog - log10(450))/(log10(1600) - log10(450))
lava$setshifting$shiftrtscore <- 5-(5*lava$setshifting$shftstem)
lava$setshifting$shiftscore <- lava$setshifting$shiftaccscore + lava$setshifting$shiftrtscore

## Processing Speed
## Per-task reaction-time z-scores: (median RT - normative mean) / normative SD.
## A task contributes only when its accuracy is at least 70%; otherwise NA.
## Normative constants are hard-coded from the legacy scoring script.
lava$infoprocessingspeed$animyesz <- ifelse(lava$infoprocessingspeed$anim_yes_acc >= 70,(lava$infoprocessingspeed$anim_yes_med - 595.01)/ 83.42073526,NA)
lava$infoprocessingspeed$animnoz <- ifelse(lava$infoprocessingspeed$anim_no_acc >= 70,(lava$infoprocessingspeed$anim_no_med - 586.48) / 78.34211216,NA)
lava$infoprocessingspeed$line10z <- ifelse(lava$infoprocessingspeed$lines_10_acc >= 70,(lava$infoprocessingspeed$lines_10_med - 606.74) /129.6893719,NA)
lava$infoprocessingspeed$line20z <- ifelse(lava$infoprocessingspeed$lines_20_acc >= 70,(lava$infoprocessingspeed$lines_20_med - 545.98) / 111.1612913,NA)
lava$infoprocessingspeed$rhymeyesz <- ifelse(lava$infoprocessingspeed$rhyme_yes_acc >= 70,(lava$infoprocessingspeed$rhyme_yes_med - 1300.34) / 252.0888475,NA)
lava$infoprocessingspeed$rhymenoz <- ifelse(lava$infoprocessingspeed$rhyme_no_acc >= 70,(lava$infoprocessingspeed$rhyme_no_med - 1247.78) / 215.990472,NA)
#added rotate to existing composite scripts
lava$infoprocessingspeed$rotate60z <- ifelse(lava$infoprocessingspeed$rotate_60_acc >= 70, (lava$infoprocessingspeed$rotate_60_med - 1531.53) / 482.8357335, NA)
lava$infoprocessingspeed$rotate120z <- ifelse(lava$infoprocessingspeed$rotate_120_acc >= 70, (lava$infoprocessingspeed$rotate_120_med - 1922.08) / 680.7012085, NA)
lava$infoprocessingspeed$search16nz <- ifelse (lava$infoprocessingspeed$search_16n_acc >= 70,(lava$infoprocessingspeed$search_16n_med - 1189.21) /357.3644424,NA)
lava$infoprocessingspeed$search16yz <- ifelse (lava$infoprocessingspeed$search_16y_acc >= 70,(lava$infoprocessingspeed$search_16y_med - 793.00) /124.7270867,NA)
lava$infoprocessingspeed$search24nz <- ifelse (lava$infoprocessingspeed$search_24n_acc >= 70,(lava$infoprocessingspeed$search_24n_med - 1465.84) /461.1421215,NA)
lava$infoprocessingspeed$search24yz <- ifelse (lava$infoprocessingspeed$search_24y_acc >= 70,(lava$infoprocessingspeed$search_24y_med - 855.75) /147.8585169,NA)
lava$infoprocessingspeed$wordyesz <- ifelse(lava$infoprocessingspeed$word_yes_acc >= 70,(lava$infoprocessingspeed$word_yes_med - 633.19) / 101.3623032,NA)
# NOTE(review): word_no uses a strict '>' where every other task uses '>=', so
# an accuracy of exactly 70 is excluded here only — confirm whether intentional
lava$infoprocessingspeed$wordnoz <- ifelse(lava$infoprocessingspeed$word_no_acc > 70,(lava$infoprocessingspeed$word_no_med - 622.24) /95.85706423,NA)
lava$infoprocessingspeed$match21z <- ifelse(lava$infoprocessingspeed$match2_l1_acc >= 70,(lava$infoprocessingspeed$match2_l1_med - 1790.48) /597.9149021,NA)
lava$infoprocessingspeed$match22z <- ifelse(lava$infoprocessingspeed$match2_l2_acc >= 70,(lava$infoprocessingspeed$match2_l2_med - 2165.30) / 746.3290314,NA)
lava$infoprocessingspeed$match23z <- ifelse(lava$infoprocessingspeed$match2_l3_acc >= 70,(lava$infoprocessingspeed$match2_l3_med - 1974.99) / 602.7246202,NA)

#added this--not in previous script but was sent by Joel
# rotatez: mean of the two rotation z-scores (NA if either is missing)
lava$infoprocessingspeed$rotatez <- 0.5 * (lava$infoprocessingspeed$rotate60z + lava$infoprocessingspeed$rotate120z)

# Verbal tasks contributing to the verbal composite
v <- c("pronz","animz","rhymez","wordz")
lava$infoprocessingspeed$pronz <- ifelse(lava$infoprocessingspeed$pron_acc >= 70,(lava$infoprocessingspeed$pron_med - 1954.01) /605.5701116,NA)
lava$infoprocessingspeed$animz <- 0.5*(lava$infoprocessingspeed$animyesz + lava$infoprocessingspeed$animnoz)
lava$infoprocessingspeed$rhymez <- 0.5*(lava$infoprocessingspeed$rhymeyesz + lava$infoprocessingspeed$rhymenoz)
lava$infoprocessingspeed$wordz <- 0.5*(lava$infoprocessingspeed$wordyesz + lava$infoprocessingspeed$wordnoz)
# Count of non-missing verbal tasks per row. rowSums() replaces the original
# apply(..., 1, sum), which coerced the data frame to a matrix on every call;
# values are identical.
lava$infoprocessingspeed$nmverbal <- rowSums(!is.na(lava$infoprocessingspeed[, v]))

# Spatial tasks contributing to the spatial composite
s <- c("dotz","match1z","shapez","match2z","linez","searchz")
lava$infoprocessingspeed$dotz <- ifelse(lava$infoprocessingspeed$dot_acc >= 70,(lava$infoprocessingspeed$dot_med - 613.73) /129.6581279,NA)
lava$infoprocessingspeed$match1z <- ifelse(lava$infoprocessingspeed$match_acc >= 70,(lava$infoprocessingspeed$match_med - 1193.08) / 295.5833986,NA)
lava$infoprocessingspeed$shapez <- ifelse(lava$infoprocessingspeed$shape_acc >= 70,(lava$infoprocessingspeed$shape_med - 697.95) / 129.4176206,NA)
# NOTE(review): 0.33 (not 1/3) is kept from the legacy script
lava$infoprocessingspeed$match2z <- 0.33*(lava$infoprocessingspeed$match21z + lava$infoprocessingspeed$match22z + lava$infoprocessingspeed$match23z)
lava$infoprocessingspeed$linez <- 0.5*(lava$infoprocessingspeed$line10z + lava$infoprocessingspeed$line20z)
lava$infoprocessingspeed$searchz <- 0.25* (lava$infoprocessingspeed$search16nz + lava$infoprocessingspeed$search16yz + lava$infoprocessingspeed$search24nz + lava$infoprocessingspeed$search24yz)
# Count of non-missing spatial tasks per row (see rowSums note above)
lava$infoprocessingspeed$nmspatial <- rowSums(!is.na(lava$infoprocessingspeed[, s]))

## Verbal composite: mean of available verbal z-scores, requiring >= 3 tasks
lava$infoprocessingspeed$vsumz <- ifelse(lava$infoprocessingspeed$nmverbal>=3,rowSums(lava$infoprocessingspeed[,v], na.rm=TRUE),NA)
lava$infoprocessingspeed$vngood <- as.numeric(lava$infoprocessingspeed$nmverbal)
lava$infoprocessingspeed$verbal <- lava$infoprocessingspeed$vsumz/lava$infoprocessingspeed$vngood

## Spatial composite: mean of available spatial z-scores, requiring >= 5 tasks
lava$infoprocessingspeed$ssumz <- ifelse(lava$infoprocessingspeed$nmspatial>=5,rowSums(lava$infoprocessingspeed[,s], na.rm=TRUE),NA)
lava$infoprocessingspeed$sngood <- as.numeric(lava$infoprocessingspeed$nmspatial)
lava$infoprocessingspeed$spatial <- lava$infoprocessingspeed$ssumz/lava$infoprocessingspeed$sngood

## Memory score
# CVLT immediate recall: total correct across learning trials 1-5
bedside$immrecall <- bedside$cv2t1c + bedside$cv2t2c + bedside$cv2t3c + bedside$cv2t4c + bedside$cv2t5c
# Benson/Rey recall: standard form if present, otherwise the alternate (B) form
bedside$bensonrecall <-ifelse(!is.na(bedside$rey10m), bedside$rey10m, bedside$rey_b10m)

# Memory component z-scores (normative constants from the legacy script)
m <-c("immrecallz","cv2lfrcz","cv2dprimez","bensonrecallz")
bedside$immrecallz <- (bedside$immrecall - 50.8277) /10.45484
bedside$cv2lfrcz <- (bedside$cv2lfrc- 11.55) /3.066
bedside$cv2dprimez <- (bedside$cv2dprime - 3.1658) /.72185
bedside$bensonrecallz <- (bedside$bensonrecall - 11.1576) /3.14529
# Count of non-missing memory components per row. rowSums() replaces the
# original apply(..., 1, sum) (matrix coercion per call); values identical.
bedside$nmmemory <- rowSums(!is.na(bedside[, m]))

## Memory composite: mean of available components, requiring >= 3 of 4
bedside$msumz <- ifelse(bedside$nmmemory>=3,rowSums(bedside[,m], na.rm=TRUE),NA)
bedside$mngood <- as.numeric(bedside$nmmemory)
bedside$memoryzscore <- bedside$msumz /bedside$mngood

## Bedside Executive score
# Executive component z-scores; MTTimez is sign-reversed so higher = better
e <- c("DigitBWz", "StrpCorz", "MTTimez", "dcorrz", "DFCorrz")
bedside$DigitBWz <- (bedside$digit_bw - 5.34) /1.262
bedside$StrpCorz <- (bedside$strp_cor - 48.53) /10.991
bedside$MTTimez <- (32.18 - bedside$mt_time) /16.618
bedside$dcorrz <- (bedside$d_corr - 15.7128) /4.55404
bedside$DFCorrz <- (bedside$df_corr - 10.51) /3.195
# Count of non-missing executive components per row (see rowSums note above)
bedside$nmbsex <- rowSums(!is.na(bedside[, e]))

## Bedside Executive composite: mean of available components, requiring >= 3 of 5
bedside$esumz <- ifelse(bedside$nmbsex>=3,rowSums(bedside[,e], na.rm=TRUE),NA)
bedside$engood <- as.numeric(bedside$nmbsex)
bedside$bsexzscore <- bedside$esumz /bedside$engood

## Round derived bedside vars to three decimal places and drop scratch columns
bedside <- bedside %>%
  select(-engood, -nmmemory, -nmbsex, -mngood, -msumz, -esumz) %>%
  mutate(across(c(immrecall, bensonrecall, immrecallz,
                  cv2fptot, cv2phit, cv2pfp, cv2zhit, cv2zfp, cv2dprime, cv2bias, cv2dprimez,
                  bensonrecallz, memoryzscore, DigitBWz, StrpCorz, MTTimez, dcorrz, DFCorrz,
                  bsexzscore, mt_ratio, mt_ln),
                ~ janitor::round_half_up(.x, 3)))

## Round derived infoprocspeed vars to three decimal places and keep keys + scores
infoprocessingspeed <- lava$infoprocessingspeed %>%
  select(-nmverbal, -nmspatial, -vngood, -sngood, -vsumz, -ssumz) %>%
  mutate(across(c(animyesz, animnoz, line10z, line20z, rhymeyesz, rhymenoz, rotate60z, rotate120z, search16nz, search16yz, search24nz,
                  search24yz, wordyesz, wordnoz, match21z, match22z, match23z, rotatez, pronz, animz, rhymez, wordz, dotz,
                  match1z, shapez, match2z, linez, searchz, verbal, spatial),
                ~ janitor::round_half_up(.x, 3))) %>%
  select(PIDN, DCDate, instr_type, task, version, animyesz, animnoz, line10z,
         line20z, rhymeyesz, rhymenoz, rotate60z, rotate120z, search16nz, search16yz,
         search24nz, search24yz, wordyesz, wordnoz, match21z, match22z, match23z,
         rotatez, pronz, animz, rhymez, wordz, dotz, match1z, shapez, match2z, linez,
         searchz, verbal, spatial)

## Round derived 1-Back vars to three decimal places
`1back` <- lava$`1back` %>%
  mutate(across(c(nb1Hits, nb1TotalNo, nb1FalseAlarms, nb1ZFA, nb1ZHIT, nb1dprime),
                ~ janitor::round_half_up(.x, 3))) %>%
  select(PIDN, DCDate, instr_type, task, version, nb1Hits, nb1TotalNo, nb1FalseAlarms, nb1ZFA, nb1ZHIT, nb1dprime)

## Round derived 2-Back vars to three decimal places
`2back` <- lava$`2back` %>%
  mutate(across(c(nb2Hits, nb2TotalNo, nb2FalseAlarms, nb2ZFA, nb2ZHIT, nb2dprime, nb2bias),
                ~ janitor::round_half_up(.x, 3))) %>%
  select(PIDN, DCDate, instr_type, task, version, nb2Hits, nb2TotalNo, nb2FalseAlarms, nb2ZFA, nb2ZHIT, nb2dprime, nb2bias)

## Round derived Flanker eprime vars to three decimal places
enclosedflanker <- lava$enclosedflanker %>%
  mutate(across(c(flkincacc, flkincaccscore, flkinclog, flkincstem, flkincrtscore, flankerinc),
                ~ janitor::round_half_up(.x, 3))) %>%
  select(PIDN, DCDate, instr_type, task, version, flkincacc, flkincaccscore, flkinclog, flkincstem, flkincrtscore, flankerinc)

## Round derived SetShifting eprime vars to three decimal places
setshifting <- lava$setshifting %>%
  mutate(across(c(shiftacc, shiftaccscore, shftlog, shftstem, shiftrtscore, shiftscore),
                ~ janitor::round_half_up(.x, 3))) %>%
  select(PIDN, DCDate, instr_type, platform, shiftacc, shiftaccscore, shftlog, shftstem, shiftrtscore, shiftscore)

GDS subscores

# GDS subscale scores: each subscale is the total of its item set.
# BUGFIX(robustness): the original used sum(...) inside mutate, which only
# produced per-row totals because the frame happened to be grouped with one
# row per PIDN/DCDate; elementwise `+` gives the same values (NA propagates
# identically) and stays correct even if the frame is ever ungrouped.
bedside <- bedside %>%
  mutate(gds_depression = gds16 + gds9 + gds3 + gds1 + gds23 + gds7 + gds4 + gds25 + gds15,
         gds_wav = gds12 + gds28 + gds2 + gds19 + gds20 + gds21,
         gds_anxiety = gds8 + gds13 + gds6 + gds18,
         gds_cognitive = gds14 + gds26 + gds30 + gds29,
         gds_hopelessness = gds22 + gds10 + gds17 + gds5,
         gds_agitation = gds11 + gds24 + gds27)

UDS Neuropsych Calculations

# curate the dataset to pass into code from scoreUDS_mirt_toShare.R
# Combines all UDS neuropsych sources, keeps the least-missing row per
# PIDN/DCDate, attaches education and sex from the UDS demographics, and
# converts LAVA/NACC sentinel codes to NA.
data_raw <- bind_rows(
    lava$udsneuropsych,
    lava$udsneuropsychtcog,
    lava$udsneuropsychmoca)%>%
    group_by(PIDN,DCDate)%>%
    # least-missing row sorts first within each visit, so distinct() keeps it
    arrange(PIDN, DCDate, rowSums(is.na(.)))%>%
    distinct(PIDN, DCDate, .keep_all = TRUE)%>%
    # right join keeps every demographics visit, even without neuropsych data
    right_join(lava$udssubjectdemo%>%
                 select(PIDN,DCDate,educ,sex), by = c('PIDN','DCDate'))%>%
    # X = sex recoded so that male = 0, female = 1 (required by the scoring code)
    # NOTE(review): assumes LAVA codes sex as 1 = male, 2 = female — confirm
    mutate('X' = sex-1)%>%
    select(PIDN, educ, age_at_dc, X, sex, animals, veg, udsverfc, udsverlc, digbacct, traila, trailali, trailb, trailbli)%>%
    # sentinel / not-administered codes to NA
    mutate(across(c(educ ,sex, animals:digbacct, trailali, trailbli),
                  ~ifelse(. %in% c(-2, -3, -4, -5, -6, -7, -8, -9, -10, 88, 95, 96, 97, 98, 99, 888, 8888, 995, 996, 997, 998), NA, .)))%>%
    mutate(across(c(traila, trailb), ~ifelse(. %in% c(-9, -8, -7, -6, 995, 996, 997, 998), NA, .)))%>%
    # carry education forward across a participant's later visits
    group_by(PIDN)%>%
    fill(educ, .direction = 'down')%>%
    ungroup()%>%
    # rename to the names the scoring script expects
    rename(naccage = age_at_dc)%>%
    select(-sex)%>%
    rename(sex = X)%>%
    as.data.frame()


### pulled from scoreUDS_mirt_toShare.R --- NO NEED TO run that script if running this 

# Note. Please make sure that your sex variable (named SEX) has been recoded such that male =  0, female = 1

#################################    NO CHANGES NEEDED PAST THIS POINT   ##############################################

#### Clean your data ####
# Lowercase all names to match the scoring script's expectations.
names(data_raw) <- tolower(names(data_raw))

# Scores above their plausible ceilings are treated as missing.
for (nm in c("animals", "veg", "udsverfc", "udsverlc")) {
  data_raw[[nm]][data_raw[[nm]] > 70] <- NA
}
for (nm in c("traila", "trailb")) {
  data_raw[[nm]][data_raw[[nm]] > 400] <- NA
}
data_raw$digbacct <- as.integer(data_raw$digbacct)
for (nm in c("digbacct", "trailali", "trailbli")) {
  data_raw[[nm]][data_raw[[nm]] > 70] <- NA
}

# Trails A/B lines-per-minute ratios
data_raw$trailA_ratio <- (data_raw$trailali * 60) / data_raw$traila
data_raw$trailB_ratio <- (data_raw$trailbli * 60) / data_raw$trailb
#### End clean ####

#### Recode continuous variables to ordinal ####
# Recode script was written by Dan Mungas

# Original (continuous) variables and their recoded counterparts;
# transformed names carry an "_r" suffix.
var_OG <- c("animals","veg","udsverfc","udsverlc","digbacct", "trailA_ratio", "trailB_ratio")
var_Recode <- paste0(var_OG, "_r")

recodeLookup <- function(df, varlist_orig, varlist_tr, type = "continuous", lookup = NULL,  ### should this be ordinal.
                         lu_type = "data") {
  # Recode continuous variables to ordinal scores via a lookup table
  # (recode logic originally written by Dan Mungas).
  #
  #   df           data frame (or, legacy, a string naming one) with original vars
  #   varlist_orig names of the original (continuous) variables
  #   varlist_tr   names of the transformed (recoded) variables to create
  #   type         "continuous" interpolates within bins; anything else assigns
  #                the bin's floor score
  #   lookup       optional lookup source: a data frame or (legacy) a string name
  #   lu_type      "lookup" = wide table with recode_score / min_* / max_* columns;
  #                "data"   = a data set already containing orig + recoded columns
  #
  # Returns df with the varlist_tr columns filled in.

  # Legacy interface: a character argument is the *name* of an object.
  # NOTE(review): eval(parse(...)) is fragile; prefer passing data frames directly.
  if (is.data.frame(df)) {
    rcd <- df
  } else {
    rcd <- eval(parse(text = df))
  }
  if (is.null(lookup)) {
    rcdlu <- rcd[, c(varlist_orig, varlist_tr)]
  } else {
    if (is.data.frame(lookup)) {
      if (lu_type == "lookup") {
        rcdlu <- lookup
      } else {
        rcdlu <- lookup[, c(varlist_orig, varlist_tr)]
      }
    } else {
      rcdlu <- eval(parse(text = lookup))
      if (lu_type != "lookup") {
        rcdlu <- rcdlu[, c(varlist_orig, varlist_tr)]
      }
    }
  }
  for (j in seq_along(varlist_orig)) {
    if (lu_type == "lookup") {
      # Build this variable's bin table: each recode_score covers [orig_min, orig_max)
      luv <- rcdlu[, c("recode_score", paste0("min_", varlist_orig[j]), paste0("max_", varlist_orig[j]))]
      names(luv) <- sub("recode_score", "tr_min", names(luv))
      luv$tr_max <- luv$tr_min
      names(luv) <- c("tr_min", "orig_min", "orig_max", "tr_max")
      luv <- luv[!is.na(luv$orig_min), ]
      # Close each bin at the next bin's lower bound
      for (i in seq_len(nrow(luv))) {
        if (i < nrow(luv)) {
          luv[i, "orig_max"] <- luv[i + 1, "orig_min"]
        }
      }
    } else {
      # Derive the bin table from observed (orig, transformed) value pairs
      t5 <- unique(rcdlu[!is.na(rcdlu[, varlist_tr[j]]), c(varlist_orig[j], varlist_tr[j])])
      t5 <- t5[order(t5[varlist_orig[j]]), ]
      luv <- as.data.frame(cbind(t5, rbind(t5[2:nrow(t5), ], c(NA, NA))))
      colnames(luv) <- c("orig_min", "tr_min", "orig_max", "tr_max")
    }
    mino <- min(luv[, "orig_min"], na.rm = TRUE)
    mint <- min(luv[, "tr_min"], na.rm = TRUE)
    maxo <- max(luv[, "orig_max"], na.rm = TRUE)
    maxt <- max(luv[, "tr_max"], na.rm = TRUE)

    # Pre-assign values at/beyond the extremes; interior values are resolved below
    rcd[, varlist_tr[j]] <- ifelse(rcd[, varlist_orig[j]] <= mino, mint, NA)
    rcd[, varlist_tr[j]] <- ifelse(rcd[, varlist_orig[j]] >= maxo, maxt, rcd[, varlist_tr[j]])
    t3 <- rcd[, c(varlist_orig[j], varlist_tr[j])]

    # Range-join each value into its bin. (The original had an if/else here
    # whose two branches were byte-identical; collapsed to one statement.)
    sqlcd <- paste("SELECT * FROM t3 AS t3
                     LEFT JOIN
                     (SELECT * FROM luv)
                     AS luv1 ON t3.", varlist_orig[j],
                   " >= luv1.orig_min AND t3.", varlist_orig[j],
                   " < luv1.orig_max", sep = "")
    t4 <- sqldf::sqldf(sqlcd)
    if (type == "continuous") {
      # Linear interpolation within the matched bin
      t4[, varlist_tr[j]] <- ifelse(!is.na(t4[, varlist_tr[j]]), t4[, varlist_tr[j]],
                                    ifelse(t4[, varlist_orig[j]] >= maxo, maxt,
                                           (((t4[, varlist_orig[j]] - t4$orig_min) / (t4$orig_max - t4$orig_min)) *
                                              (t4$tr_max - t4$tr_min)) + t4$tr_min))
    } else {
      # Ordinal: assign the bin's floor score
      t4[, varlist_tr[j]] <- ifelse(!is.na(t4[, varlist_tr[j]]), t4[, varlist_tr[j]],
                                    ifelse(t4[, varlist_orig[j]] >= max(t4$orig_max, na.rm = TRUE), max(t4$tr_max, na.rm = TRUE),
                                           t4$tr_min))
    }
    rcd[, varlist_tr[j]] <- t4[, varlist_tr[j]]
  }
  return(rcd)
}

# Lookup table of EF recode bins distributed with the UDS3-EF scoring materials
luv1 <- read.table(paste(repofolder,"score_uds_folder/UDS3_EF_Lookup_4.21.20_share.csv", sep = ''), header=TRUE, sep=",")

# Recode the raw neuropsych variables to ordinal scores using the lookup table
recoded_dat <- recodeLookup(data_raw, var_OG, var_Recode, type="ordinal", lookup=luv1, lu_type="lookup" )

# Loads the fitted MIRT model used for factor scoring below.
# NOTE(review): the object name `uds_res_results` is assumed from its use
# below; it is defined inside this .Rdata file — confirm.
load(paste(repofolder,"score_uds_folder/nacc_fit_MIRT_share.Rdata", sep =''))

# Number of missing values in a vector
countMiss <- function(x) {sum(is.na(x))}

# rowid keys each record so factor scores can be merged back later
recoded_dat$rowid <- 1:nrow(recoded_dat)
recoded_dat_VarsOnly <- recoded_dat[,var_Recode]
recoded_dat_VarsOnly$rowid <- 1:nrow(recoded_dat_VarsOnly)
recoded_dat_VarsOnly$n_miss <- apply(recoded_dat_VarsOnly[,var_Recode],1, countMiss)

# extracts data frame with records that have at least one non-missing item
recode_cal <- recoded_dat_VarsOnly[recoded_dat_VarsOnly$n_miss < length(var_Recode),]

# EAP factor scores (and standard errors) from the fitted MIRT model
fsc <- data.frame(mirt::fscores(uds_res_results, response.pattern = recode_cal[,var_Recode],
                           method = "EAP" , QMC=TRUE,full.scores.SE=T,na.rm=F))

fsc$rowid <- recode_cal$rowid
# rename factor columns F / SE_F to uds3_ef / SE_uds3_ef
names(fsc) <- sub("F","uds3_ef",names(fsc))

vars2merge<- c("rowid","uds3_ef","SE_uds3_ef")


# merge scores back onto the full recoded dataset by rowid
final_dat <- plyr::join(recoded_dat, fsc[,vars2merge], by = c("rowid"), type ="left")


#### End Recode ####

### Demographic Corrections ####

# Important Disclaimers:
#1. Please note that these scores must be interpreted with extreme caution if the demographics of your sample
#differ from the demographics of the NACC normative dataset, which is highly white (X%), well educated, and English speaking.
#Validation work is underway for individuals with different demographic variables. 

#2. If using with longitudinal data, consider creating a baseline age variable for each person. By using time-varying age, 
#there is the potential that an individual's longitudinal performance is altered due to changing age at each timepoint, which might
# not be desirable for all studies.

#3. Adjusted scores will only be calculated for a row of data if all three demographic variables are available

# for sex, recode so that male =  0, female = 1

####  Demographic Corrections ####

# Varset should stay the same - these are the variable names from the normative dataset
varset <- c("uds3_ef")

# varnames refers to the variable names in the dataset on which we are
# calculating the z-scores.
varnames <- c("uds3_ef")

datfull_1 <- as.data.frame(final_dat)

## The normative lookup join requires demographics under these exact names
## (EDUC, NACCAGE, SEX).
datfull_1$EDUC <- datfull_1$educ
datfull_1$NACCAGE <- as.integer(datfull_1$naccage)
datfull_1$SEX <- as.integer(datfull_1$sex)

## rowid keys each record so adjusted scores can be merged back later
datfull_1$rowid <- seq_len(nrow(datfull_1))

## Keep only rows with the score and all three demographics present —
## adjusted scores are only defined when all are available.
## (filter_at()/all_vars() are superseded; if_all() is the current idiom.)
datfull_2 <- datfull_1 %>%
  filter(if_all(all_of(c("uds3_ef", "EDUC", "NACCAGE", "SEX")), ~ !is.na(.x)))

## Bringing in extreme predictor values: clamp age and education to the bounds
## of the normative lookup tables. Vectorized replacement of the original
## per-row loop; cutoffs and replacement values are unchanged (note the
## cutoffs are .5 off the replacement values, so pmin/pmax would NOT be
## equivalent — e.g. an age of 49.7 must stay 49.7).
datfull_2$NACCAGE[datfull_2$NACCAGE < 49.5] <- 50
datfull_2$NACCAGE[datfull_2$NACCAGE > 90.5] <- 90
datfull_2$EDUC[datfull_2$EDUC < 9.5] <- 10
datfull_2$EDUC[datfull_2$EDUC > 20.5] <- 20

# Standardize x against mean xbar and standard deviation s (vectorized).
zscore <- function(x, xbar, s) (x - xbar) / s

# For each score, join the demographic-normative lookup (keyed on NACCAGE,
# EDUC, SEX) and compute a demographically adjusted z-score.
for (i in seq_along(varset)) {
  lookup <- read.csv(paste(repofolder,'score_uds_folder/LookUps/',varset[i],"_lookup.csv", sep=""))
  label1 <- as.character(paste(varset[i],"_mean.adj", sep=""))
  lookup[label1] <- lookup$mean.adj           ### Need to use brackets to assign a character as a variable name.
  label2 <- paste(varset[i],"_sd.adj", sep="")
  lookup[label2] <- lookup$sd.adj
  lookup$sd.adj <- NULL
  lookup$mean.adj <- NULL

  label3 <- as.character(varnames[i])

  # attach the score-specific adjusted mean/sd to each row by demographics
  merged_data <- plyr::join(datfull_2, lookup, by = c("NACCAGE", "EDUC","SEX"), type ="left")

  datfull_2[label1] <- merged_data[label1]
  datfull_2[label2] <- merged_data[label2]
  zscore_label <- as.character(paste(varset[i],"_Z", sep=""))

  datfull_2[zscore_label] <- zscore(datfull_2[label3], datfull_2[label1], datfull_2[label2])
  # BUGFIX: the original assigned the *string* "NA" here, which would coerce
  # the z-score column to character if any score were missing. In practice
  # datfull_2 is already filtered to non-missing scores, so the branch was
  # latent; the missing marker is now a true NA.
  datfull_2[zscore_label][is.na(datfull_2[label3])] <- NA
}

# Adjusted-score columns (plus the rowid key) to carry back to the full data.
vars2merge_demo <- c("rowid", "uds3_ef_mean.adj", "uds3_ef_sd.adj", "uds3_ef_Z")

#bv_data_UDS_recoded <- as.data.frame(bv_data_UDS_recoded)

# Left-join the adjusted scores back onto every row (rows filtered out above
# simply get NA for the adjusted columns).
final_dat_demo <- plyr::join(datfull_1, datfull_2[, vars2merge_demo], by = c("rowid"), type = "left")


# Convert to tibble and restore the project's PIDN/DCDate column casing.
final_dat_demo <- final_dat_demo %>%
  as_tibble() %>%
  rename(DCDate = dcdate, PIDN = pidn)

#saving output of scoring from code from scoreUDS_mirt_toShare.R to the curated dataset requested
# Rebuilds the curated UDS neuropsych table from the raw sources, converts
# sentinel codes to NA, then attaches the UDS3-EF factor scores computed above.
bind_rows(
    lava$udsneuropsych,
    lava$udsneuropsychtcog,
    lava$udsneuropsychmoca)%>%
    select(PIDN, DCDate, instr_type, form_ver, v_type, traila, trailarr, trailali, trailb, trailbrr, trailbli, wais, 
           memunits, pentagon, craftvrs, crafturs, udsbentc, digforct, craftdvr, craftdre,
           udsbentd, udsbenrs, minttots, udsvertn, cogstat, logimem:boston, mocatots, mmse_zscore:boston_zscore)%>%
    # LAVA/NACC sentinel / not-administered codes to NA
    mutate(across(c(traila:mocatots),
                  ~ifelse(. %in% c(-1, -2, -5, -6, -7, -8, -9, 995, 996, 997, 998), NA, .)))%>%
    mutate(across(c(trailarr, trailali, trailbrr, trailbli, memunits:mocatots),
                  ~ifelse(. %in% c(88, 95, 96, 97, 98, 99), NA, .)))%>%
    # NOTE(review): only digib == 70 exactly is blanked here, while the scoring
    # prep above treats values > 70 as missing — confirm the intended cutoff
    mutate(digib = na_if(digib, 70))%>%
    group_by(PIDN,DCDate)%>%
    # least-missing row sorts first within each visit, so distinct() keeps it
    arrange(PIDN, DCDate, rowSums(is.na(.)))%>%
    distinct(PIDN,DCDate, .keep_all = TRUE)%>%
    # attach UDS3-EF factor scores and demographically adjusted z-scores
    left_join(final_dat_demo%>%
                  as_tibble())%>%
  # drop the intermediate "_r" recoded item columns
  select(-ends_with('_r'))%>%
  relocate(uds3_ef,
           SE_uds3_ef,
           uds3_ef_Z,
           uds3_ef_mean.adj,
           uds3_ef_sd.adj,
           digiflen_zscore,
           digiblen_zscore,
           mmse_zscore:boston_zscore,
           .after = v_type)%>%
  # scoring scaffolding no longer needed downstream
  select(-c(instr_type, form_ver, EDUC, NACCAGE, SEX, educ, naccage, sex, rowid)) ->
  uds_neuropsych

Compile List of DFs

  • All dfs here will be joined to timepoints

    • Genetics, diagnosis_latest, demographics won’t be joined the same way as these, so they’re excluded
# All curated data frames to be joined to the timepoints; lst() keeps each
# object under its own name. (Genetics, diagnosis_latest, and demographics are
# joined differently and are deliberately excluded.)
curated_dfs_named <- lst(
  
## bedside data and composites
bedside,

## uds_neuropsych
uds_neuropsych,

## infoprocspeed
infoprocessingspeed,

##1 Back
`1back`,

## 2 Back
`2back`,

## Flanker eprime
enclosedflanker,

## SetShifting eprime
setshifting,

## neuroexam
adrcneuroexam,

## uds health history
health_history,

## uds medical conditions
med_conditions,

## hbudsphysical 
physical,

## uds medications prescription
medications,

## uds otc meds
otc_meds,

## uds vitamins data
supplements,

## Need to calc composite scoring for this
brainhealthassessment,

## cdr
cdr,

## champs
champs,

## CAS -- scored
## NOTE(review): object name 'cogntive_activity_scale' is misspelled upstream;
## kept as-is for compatibility
cogntive_activity_scale,

## cpt
cpt,

## synaptic_markers
csf,

## diagnosis
diagnosis,

## early dev history
earlydevhistory,

## everydaycogself
everydaycogself,

## faq
faq,

## fisherman story
fishermanstory,

## fitbit data
fitbit,

## GAD
gad,

## GAIT
gait, 

## Grit
grit,

## hbclinicallabs_myelinucddra
clinical_labs,

## pet_data
pet,

## MRI data
# schaefer_imaging = imaging_ip$schaefer_imaging,
imaging_wmh,
imaging_pasl,
imaging_pcasl,
# diffusion_tensor_imaging = imaging_ip$imaging_dti_joined,
# cerebral_blood_flow_pulsed_ASL = imaging_ip$pasl_combined, 
# `cerebral_blood_flow_pseudo-continuous_ASL` = imaging_ip$pcasl_combined, 
# imaging_gm_all_production = imaging_ip$`mri_box_data_modified_clean_names$imaging_gm_all_production`,
# imaging_cbf_pvc_gm = imaging_ip$`mri_box_data_modified_clean_names$imaging_cbf_pvc_gm`,
# imaging_t1 = imaging_ip$`old_imaging$imaging_t1`,


## MAAS
maas_mindfulness,

## mesoscale
mesoscale,

## mind diet
diet,

## npi -- joining the two npi instruments
npi,

## PASE
pase,

## pattern separation
patternseparation,

## psqi
psqi,

## PSS
pss,

## berlin sleep 
berlin_sleep,

## edinburgh_handedness_inventory 
edinburgh_handedness,

## epworth sleepiness scale
epworth_sleepiness,

## insomnia severity
insomnia_severity_index,

## other sleep instruments not collected by hillblom
sleep_instruments,

## sleep apnea
apnea,

## sleep profiler
sleep_profiler,

## sex and reproduction
sex_and_reproductive_health,
  
## sni -- only have scoring for some of the variables included previously
social_network_index, 

## quanterix
quanterix,

## quanterix_fitbit
quanterix_fitbit,

## bu_subset_data
bu_subset_data,

## osu_tbi
osu_tbi_locpta_subset_data,

## tabcat_data
tabcat_bha,
tabcat_animal_fluency,

## consolidated dot counting data
dot_counting,

## tabcat_data
tabcat_flanker, 
tabcat_ll, 
tabcat_lo,
tabcat_match,
tabcat_rapid_naming,
tabcat_running_dots,
tabcat_set_shifting,
tabcat_fav,

## virtual_bedside (from qualtrics)
virtual_bedside)

Final cleaning

  • combine any dfs that share identical names; this shouldn’t change anything but is kept as a safety check
  • pass all dfs through clean_instrument; for duplicated rows this keeps the entry with the fewest NAs
# Merge any dfs that share a name (expected to be a no-op safety check).
curated_dfs_named <- combiner(curated_dfs_named)

# Resolve duplicated entries, keeping the row with the fewest NA values.
curated_dfs_named <- map(curated_dfs_named, clean_instrument)

Step 3

Image

Set Attributes

  • If not set, prompted to enter responses
#get all dfs 
# Collects every timepointed df plus any originals without timepoints into one
# list; each `if` contributes nothing when its object does not exist.
list_of_dfs <- c(if(exists('timepoints_dfs')) pluck(timepoints_dfs), if(exists('original_data_no_tps')) pluck(original_data_no_tps))
#strip 'original_data' from name. might not be needed since we set names later on
names(list_of_dfs) <- str_replace(names(list_of_dfs), "^original_data_", "")

# set attributes for datasets
# Pre-set attributes skip the interactive readline()/menu() prompts in the
# linking loop below; any df missing them is prompted for instead.

attr(list_of_dfs$fitbit, "dataset_name") <- 'fitbit_dataset'
attr(list_of_dfs$BrANCH, "dataset_name") <- 'BrANCH_dataset'
attr(list_of_dfs$specimens, "dataset_name") <- 'specimens_dataset'
attr(list_of_dfs$sleep, "dataset_name") <- 'sleep_dataset'

# NOTE(review): BrANCH gets a dataset_name but no df_suffix or
# run_chunk_name_in_curated_dfs, so those will be prompted interactively —
# confirm that is intended.
attr(list_of_dfs$fitbit, 'df_suffix') <- 'fitbit'
attr(list_of_dfs$specimens, 'df_suffix') <- 'specimens'
attr(list_of_dfs$sleep, 'df_suffix') <- 'sleep'

attr(list_of_dfs$fitbit, "run_chunk_name_in_curated_dfs") <- 1
attr(list_of_dfs$specimens, 'run_chunk_name_in_curated_dfs') <- 2
attr(list_of_dfs$sleep, 'run_chunk_name_in_curated_dfs') <- 2

Dataset Linking

# Restrict the curated instrument dataframes to those that can be linked
# (i.e. contain a PIDN column) and coerce PIDN to double so the join key
# has a consistent type across every source.
curated_dfs_named <- curated_dfs_named %>%
  keep(~ 'PIDN' %in% names(.x)) %>%
  map(~ mutate(.x, PIDN = as.double(PIDN)))

# Build the final master datasets. For each anchor dataframe in list_of_dfs:
#   1. optionally suffix its non-key columns ('df_suffix' attribute, or a
#      readline() prompt),
#   2. optionally drop the matching entry from curated_dfs_named so the
#      anchor's own instrument is not joined back onto itself
#      ('run_chunk_name_in_curated_dfs' attribute or a menu; 1 == "Yes"),
#   3. join date-matched instrument data (365-day window), date-free
#      instrument data (by PIDN only), demographics, latest diagnosis and
#      genetics,
#   4. assign the result to a name taken from the 'dataset_name' attribute
#      (or a readline() prompt) and return it as a one-element named list;
#      flatten() collapses these into the flat `datasets` list.
datasets <- list()

datasets <- flatten(map(names(list_of_dfs), function(df_name) {
  # Access the dataframe using its name
  anchor_df <- list_of_dfs[[df_name]]
  
  #reset suffix
  df_suffix <- ""
  
  # anchor_data carries the (possibly suffixed) full anchor columns;
  # bare anchor_df supplies the PIDN/DCDate keys for the joins below.
  anchor_data <- anchor_df
  # Check if the dataframe has columns other than PIDN and DCDate
  # (i.e. it carries instrument data of its own, not just timepoints).
  # First branch: no 'df_suffix' attribute, so ask the user for one.
  if (ncol(ungroup(anchor_df)) > 2 & is.null(attr(anchor_df, 'df_suffix'))) {
    # Ask the user to enter a suffix for column names
    df_suffix <- readline(paste0("Enter suffix to append to column names in '", df_name,
                                 "' if linking instrument data along with the timepoints:"))
    # Tag every non-key column with '.<suffix>' to record where it came from.
    anchor_data <- anchor_df %>%
      rename_with(
        ~ paste(., df_suffix, sep = '.'),
        -c('PIDN', 'DCDate'))
    
    if (is.null(attr(anchor_df, "run_chunk_name_in_curated_dfs"))) {
      # Use the name of the dataframe in the title of the menu
      # (utils::menu returns 1 for "Yes", 2 for "No").
      run_chunk_name_in_curated_dfs <- utils::menu(
        choices = c("Yes", "No"),
        title = paste0("Is data from '", df_name, "' already in curated_dfs_named?", sep = ' '))
      } else {
        run_chunk_name_in_curated_dfs <- attr(anchor_df, "run_chunk_name_in_curated_dfs")
        } 
    
    # 1 == "Yes": the anchor's instrument is already in curated_dfs_named,
    # so drop it for this iteration. NOTE(review): this assignment creates a
    # function-local curated_dfs_named; the global list is never modified.
    if (run_chunk_name_in_curated_dfs == 1) {
      unmodified_curated_dfs <- curated_dfs_named
      curated_dfs_named <- discard(curated_dfs_named, names(curated_dfs_named) %in% df_suffix)
    }
    
    # Second branch: same logic, but the suffix comes from the 'df_suffix'
    # attribute instead of a prompt.
    } else if (ncol(ungroup(anchor_df)) > 2 & !is.null(attr(anchor_df, 'df_suffix'))) {
      df_suffix <- as.character(attr(anchor_df, 'df_suffix'))
      anchor_data <- anchor_df %>%
        rename_with(
          ~ paste(., df_suffix, sep = '.'),
          -c('PIDN', 'DCDate'))
      
      if (is.null(attr(anchor_df, "run_chunk_name_in_curated_dfs"))) {
        # Use the name of the dataframe in the title of the menu
        run_chunk_name_in_curated_dfs <- utils::menu(
          choices = c("Yes", "No"),
          title = paste0("Is data from '", df_name, "' already in curated_dfs_named?", sep = ' '))
        } else {
          run_chunk_name_in_curated_dfs <- attr(anchor_df, "run_chunk_name_in_curated_dfs") 
          } 
      
      if (run_chunk_name_in_curated_dfs == 1) {
        unmodified_curated_dfs <- curated_dfs_named
        curated_dfs_named <- discard(curated_dfs_named, names(curated_dfs_named) %in% df_suffix)
      }
    }

  # Vascular burden data (project helper) for the anchor's (PIDN, DCDate) keys.
  vasc_burden <- get_vasc_burden(select(anchor_df, PIDN, DCDate))
  
  # Join every dated instrument df (plus vasc_burden) onto the anchor
  # timepoints, matching visits within a 365-day window.
  joined_data <- join_list_of_dfs_to_timepoints(select(anchor_df, PIDN, DCDate), curated_dfs_named%>%append(lst(vasc_burden)), 365)
  
  dataset_joined <- left_join(anchor_data, joined_data, by = c('PIDN', 'DCDate'))
  
  # Add instrument dfs that have a PIDN column but no DCDate: these are
  # joined on PIDN alone, so their values repeat across a subject's visits.
  dataset_joined_no_date_data <- dataset_joined%>%
    left_join(curated_dfs_named%>%
                keep(~any(names(.x) == 'PIDN', na.rm = T))%>%
                discard(~any(names(.x) == 'DCDate', na.rm = T))%>%
                #renaming variables in list of dataframes to include .df after the name--keeps track of where data came from
                imap(function(x, y){
                  x %>% rename_with(~paste(.,y, sep = '.'), -c('PIDN'))})%>%
                reduce(., full_join, by = 'PIDN'), by = 'PIDN')
  
  # Request user to set variable name
  if (is.null(attr(anchor_df, "dataset_name"))) {
    dataset_name <- readline(paste0("Please name the ", df_name, " dataset:", sep = " "))
  } else {
    dataset_name <- attr(anchor_df, "dataset_name")
  }

  # Define variable with user-specified name and assign that name to the final df
  assign(dataset_name, 
         #joining RFM and demographics to the front of the master df. 
         dataset_joined_no_date_data%>%
           # filter(any(...)) keeps either all rows or none (per group, if
           # the data is grouped); drop_na() then removes individual rows
           # with missing dates.
           filter(any(!is.na(DCDate)))%>%
           drop_na(DCDate)%>%
           left_join(demographics_curated_df%>%
                       rename_with(~paste(.,'demographics',sep = '.'),-c('PIDN')), by = 'PIDN')%>%
           mutate(dob.demographics = as_date(dob.demographics),
                  age_at_DCDate = eeptools::age_calc(dob.demographics, DCDate, units = 'years', precise = FALSE))%>%
           # dob is only needed to compute age; drop it from the output.
           select(-dob.demographics)%>%
           relocate(matches('.demographics'), .after = ncol(anchor_data)) %>% # Move this line up
           left_join(diagnosis_latest%>%
                       rename_with(~paste(., 'diagnosis_latest', sep = '.'), -c('PIDN')), by = 'PIDN')%>%
           relocate(matches('.diagnosis_latest'), .after = matches("res_dx_b.diagnosis"))%>%
           left_join(genetics_data%>%
                       rename_with(~paste(.,'genetics',sep = '.'), -c('PIDN')), by = 'PIDN')%>%
           relocate(PIDN : DCDate, age_at_DCDate)%>%
           # UnQID: PIDN concatenated with DCDate expressed as days since
           # 1899-12-30 — presumably to match Excel serial dates; confirm.
           mutate(UnQID = as.numeric(str_c(PIDN,as.numeric(as_date(DCDate, format = '%m/%d/%Y') - as_date('1899-12-30', format = '%Y-%m-%d')))))%>%
           relocate(UnQID, DCDate, age_at_DCDate, .after = PIDN)%>%
           remove_empty(c("rows", "cols")))
  
  #remove unmodified_curated_dfs if it was created and set curated_dfs back to itself
  # NOTE(review): exists() also searches enclosing/global environments, so a
  # stray global 'unmodified_curated_dfs' could trigger this branch; and the
  # restore only affects the function-local copy — verify this is intended.
  if(exists('unmodified_curated_dfs')) {
    curated_dfs_named <- unmodified_curated_dfs
    remove(unmodified_curated_dfs)
    }
  # Return a one-element named list; flatten() above collapses all of these
  # into the single flat `datasets` list.
  set_names(lst(get(dataset_name)), dataset_name)
  }))

Save Datasets

  • Will not save if an RDS file with today’s date already exists (existing files are never overwritten)
# Path for today's export of the linked datasets.
filename <- str_c(datafolder, "datasets_", today(), ".RDS")

# Write the dataset list only when no same-day export exists yet; an
# existing file is never overwritten.
if (!file.exists(filename)) saveRDS(datasets, filename)

Run App

  • Not publishable in Markdown
  • Can host in server and share